From be21c0711ebde2858c633edf3010bcd41d78eede Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 2025 17:55:53 -0300 Subject: [PATCH 01/14] first attempt --- .github/workflows/check-colab-notebooks.yml | 48 ++ Makefile | 8 +- docs/colab_notebooks/1-the-basics.ipynb | 534 +++++++++++++++++ ...ctured-outputs-and-jinja-expressions.ipynb | 564 ++++++++++++++++++ .../3-seeding-with-a-dataset.ipynb | 462 ++++++++++++++ .../4-providing-images-as-context.ipynb | 527 ++++++++++++++++ .../4-providing-images-as-context.py | 6 - docs/overrides/main.html | 5 +- docs/scripts/generate_colab_notebooks.py | 180 ++++++ pyproject.toml | 1 + uv.lock | 100 ++++ 11 files changed, 2427 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/check-colab-notebooks.yml create mode 100644 docs/colab_notebooks/1-the-basics.ipynb create mode 100644 docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb create mode 100644 docs/colab_notebooks/3-seeding-with-a-dataset.ipynb create mode 100644 docs/colab_notebooks/4-providing-images-as-context.ipynb create mode 100644 docs/scripts/generate_colab_notebooks.py diff --git a/.github/workflows/check-colab-notebooks.yml b/.github/workflows/check-colab-notebooks.yml new file mode 100644 index 00000000..bb64ecc1 --- /dev/null +++ b/.github/workflows/check-colab-notebooks.yml @@ -0,0 +1,48 @@ +name: Check Colab notebooks + +on: + push: + branches: [ main ] + paths: + - 'docs/notebook_source/*.py' + pull_request: + branches: [ main ] + paths: + - 'docs/notebook_source/*.py' + workflow_dispatch: + +jobs: + check-colab-notebooks: + name: Check Colab Notebooks + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + version: "latest" + python-version: "3.11" + enable-cache: true + + - name: Install dependencies + run: | + uv sync --group notebooks + + - name: Generate Colab notebooks + run: | + make generate-colab-notebooks + + - 
name: Check for differences + run: | + if git diff --exit-code docs/colab_notebooks/; then + echo "βœ… Colab notebooks are up-to-date" + else + echo "❌ Colab notebooks are out of sync with source files" + echo "" + echo "The generated notebooks differ from the committed ones." + echo "Please run 'make generate-colab-notebooks' locally and commit the changes." + exit 1 + fi diff --git a/Makefile b/Makefile index 5dd1f59d..295400ce 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ help: @echo "πŸ› οΈ Utilities:" @echo " clean - Remove coverage reports and cache files" @echo " convert-execute-notebooks - Convert notebooks from .py to .ipynb using jupytext" + @echo " generate-colab-notebooks - Generate Colab-compatible notebooks" @echo " serve-docs-locally - Serve documentation locally" @echo " check-license-headers - Check if all files have license headers" @echo " update-license-headers - Add license headers to all files" @@ -95,6 +96,11 @@ convert-execute-notebooks: rm docs/notebook_source/*.csv @echo "βœ… Notebooks created in docs/notebooks/" +generate-colab-notebooks: + @echo "πŸ““ Generating Colab-compatible notebooks..." + uv run --group notebooks python docs/scripts/generate_colab_notebooks.py + @echo "βœ… Colab notebooks created in docs/colab_notebooks/" + serve-docs-locally: @echo "πŸ“ Building and serving docs..." uv sync --group docs @@ -125,4 +131,4 @@ install-dev-notebooks: $(call install-pre-commit-hooks) @echo "βœ… Dev + notebooks installation complete!" 
-.PHONY: clean coverage format format-check lint lint-fix test check-license-headers update-license-headers check-all check-all-fix install install-dev install-dev-notebooks +.PHONY: clean coverage format format-check lint lint-fix test check-license-headers update-license-headers check-all check-all-fix install install-dev install-dev-notebooks generate-colab-notebooks diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb new file mode 100644 index 00000000..5d4c30ad --- /dev/null +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -0,0 +1,534 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9da2fbbf", + "metadata": {}, + "source": [ + "# 🎨 Data Designer Tutorial: The Basics\n", + "\n", + "#### πŸ“š What you'll learn\n", + "\n", + "This notebook demonstrates the basics of Data Designer by generating a simple product review dataset.\n" + ] + }, + { + "cell_type": "markdown", + "id": "53cc83fe", + "metadata": {}, + "source": [ + "### ⚑ Colab Setup\n", + "\n", + "Run the cells below to set up the environment for Google Colab.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ec209f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Install data-designer and dependencies\n", + "!pip install -q data-designer \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "636b7151", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up NVIDIA API key from Colab secrets\n", + "from google.colab import userdata\n", + "\n", + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "8572f99c", + "metadata": {}, + "source": [ + "### πŸ“¦ Import the essentials\n", + "\n", + "- The `essentials` module provides quick access to the most commonly used objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b80a6d6", + "metadata": {}, + "outputs": [], + "source": [ + "from 
data_designer.essentials import (\n", + " CategorySamplerParams,\n", + " DataDesigner,\n", + " DataDesignerConfigBuilder,\n", + " InferenceParameters,\n", + " LLMTextColumnConfig,\n", + " ModelConfig,\n", + " PersonFromFakerSamplerParams,\n", + " SamplerColumnConfig,\n", + " SamplerType,\n", + " SubcategorySamplerParams,\n", + " UniformSamplerParams,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a07f86e7", + "metadata": {}, + "source": [ + "### ⚙️ Initialize the Data Designer interface\n", + "\n", + "- `DataDesigner` is the main object responsible for managing the data generation process.\n", + "\n", + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9b6f8dd", + "metadata": {}, + "outputs": [], + "source": [ + "data_designer = DataDesigner()" + ] + }, + { + "cell_type": "markdown", + "id": "0e264f8b", + "metadata": {}, + "source": [ + "### 🎛️ Define model configurations\n", + "\n", + "- Each `ModelConfig` defines a model that can be used during the generation process.\n", + "\n", + "- The \"model alias\" is used to reference the model in the Data Designer config (as we will see below).\n", + "\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "\n", + "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0982db6a", + "metadata": {}, + "outputs": [], + "source": [ + "# This name is set in the model provider configuration.\n", + "MODEL_PROVIDER = \"nvidia\"\n", + "\n", + "# The model ID is from build.nvidia.com.\n", + "MODEL_ID = \"nvidia/nvidia-nemotron-nano-9b-v2\"\n", + "\n", 
+ "# We choose this alias to be descriptive for our use case.\n", + "MODEL_ALIAS = \"nemotron-nano-v2\"\n", + "\n", + "# This sets reasoning to False for the nemotron-nano-v2 model.\n", + "SYSTEM_PROMPT = \"/no_think\"\n", + "\n", + "model_configs = [\n", + " ModelConfig(\n", + " alias=MODEL_ALIAS,\n", + " model=MODEL_ID,\n", + " provider=MODEL_PROVIDER,\n", + " inference_parameters=InferenceParameters(\n", + " temperature=0.5,\n", + " top_p=1.0,\n", + " max_tokens=1024,\n", + " ),\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "e39ae0f0", + "metadata": {}, + "source": [ + "### πŸ—οΈ Initialize the Data Designer Config Builder\n", + "\n", + "- The Data Designer config defines the dataset schema and generation process.\n", + "\n", + "- The config builder provides an intuitive interface for building this configuration.\n", + "\n", + "- The list of model configs is provided to the builder at initialization.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a2ce6", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder = DataDesignerConfigBuilder(model_configs=model_configs)" + ] + }, + { + "cell_type": "markdown", + "id": "3c829214", + "metadata": {}, + "source": [ + "## 🎲 Getting started with sampler columns\n", + "\n", + "- Sampler columns offer non-LLM based generation of synthetic data.\n", + "\n", + "- They are particularly useful for **steering the diversity** of the generated data, as we demonstrate below.\n", + "\n", + "
\n", + "\n", + "You can view available samplers using the config builder's `info` property:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3afafa39", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder.info.display(\"samplers\")" + ] + }, + { + "cell_type": "markdown", + "id": "d774fe6b", + "metadata": {}, + "source": [ + "Let's start designing our product review dataset by adding product category and subcategory columns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb8ff044", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"product_category\",\n", + " sampler_type=SamplerType.CATEGORY,\n", + " params=CategorySamplerParams(\n", + " values=[\n", + " \"Electronics\",\n", + " \"Clothing\",\n", + " \"Home & Kitchen\",\n", + " \"Books\",\n", + " \"Home Office\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"product_subcategory\",\n", + " sampler_type=SamplerType.SUBCATEGORY,\n", + " params=SubcategorySamplerParams(\n", + " category=\"product_category\",\n", + " values={\n", + " \"Electronics\": [\n", + " \"Smartphones\",\n", + " \"Laptops\",\n", + " \"Headphones\",\n", + " \"Cameras\",\n", + " \"Accessories\",\n", + " ],\n", + " \"Clothing\": [\n", + " \"Men's Clothing\",\n", + " \"Women's Clothing\",\n", + " \"Winter Coats\",\n", + " \"Activewear\",\n", + " \"Accessories\",\n", + " ],\n", + " \"Home & Kitchen\": [\n", + " \"Appliances\",\n", + " \"Cookware\",\n", + " \"Furniture\",\n", + " \"Decor\",\n", + " \"Organization\",\n", + " ],\n", + " \"Books\": [\n", + " \"Fiction\",\n", + " \"Non-Fiction\",\n", + " \"Self-Help\",\n", + " \"Textbooks\",\n", + " \"Classics\",\n", + " ],\n", + " \"Home Office\": [\n", + " \"Desks\",\n", + " \"Chairs\",\n", + " \"Storage\",\n", + " \"Office Supplies\",\n", + " \"Lighting\",\n", + " ],\n", + " },\n", + " 
),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"target_age_range\",\n", + " sampler_type=SamplerType.CATEGORY,\n", + " params=CategorySamplerParams(values=[\"18-25\", \"25-35\", \"35-50\", \"50-65\", \"65+\"]),\n", + " )\n", + ")\n", + "\n", + "# Optionally validate that the columns are configured correctly.\n", + "config_builder.validate()" + ] + }, + { + "cell_type": "markdown", + "id": "6851c834", + "metadata": {}, + "source": [ + "Next, let's add samplers to generate data related to the customer and their review.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4832cab", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"customer\",\n", + " sampler_type=SamplerType.PERSON_FROM_FAKER,\n", + " params=PersonFromFakerSamplerParams(age_range=[18, 70], locale=\"en_US\"),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"number_of_stars\",\n", + " sampler_type=SamplerType.UNIFORM,\n", + " params=UniformSamplerParams(low=1, high=5),\n", + " convert_to=\"int\", # Convert the sampled float to an integer.\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"review_style\",\n", + " sampler_type=SamplerType.CATEGORY,\n", + " params=CategorySamplerParams(\n", + " values=[\"rambling\", \"brief\", \"detailed\", \"structured with bullet points\"],\n", + " weights=[1, 2, 2, 1],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.validate()" + ] + }, + { + "cell_type": "markdown", + "id": "205cd3e4", + "metadata": {}, + "source": [ + "## 🦜 LLM-generated columns\n", + "\n", + "- The real power of Data Designer comes from leveraging LLMs to generate text, code, and structured data.\n", + "\n", + "- When prompting the LLM, we can use Jinja templating to reference other columns in the dataset.\n", + "\n", + "- As 
we see below, nested json fields can be accessed using dot notation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d140be8a", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder.add_column(\n", + " LLMTextColumnConfig(\n", + " name=\"product_name\",\n", + " prompt=(\n", + " \"You are a helpful assistant that generates product names. DO NOT add quotes around the product name.\\n\\n\"\n", + " \"Come up with a creative product name for a product in the '{{ product_category }}' category, focusing \"\n", + " \"on products related to '{{ product_subcategory }}'. The target age range of the ideal customer is \"\n", + " \"{{ target_age_range }} years old. Respond with only the product name, no other text.\"\n", + " ),\n", + " system_prompt=SYSTEM_PROMPT,\n", + " model_alias=MODEL_ALIAS,\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " LLMTextColumnConfig(\n", + " name=\"customer_review\",\n", + " prompt=(\n", + " \"You are a customer named {{ customer.first_name }} from {{ customer.city }}, {{ customer.state }}. \"\n", + " \"You are {{ customer.age }} years old and recently purchased a product called {{ product_name }}. \"\n", + " \"Write a review of this product, which you gave a rating of {{ number_of_stars }} stars. \"\n", + " \"The style of the review should be '{{ review_style }}'.\"\n", + " ),\n", + " system_prompt=SYSTEM_PROMPT,\n", + " model_alias=MODEL_ALIAS,\n", + " )\n", + ")\n", + "\n", + "config_builder.validate()" + ] + }, + { + "cell_type": "markdown", + "id": "543509ef", + "metadata": {}, + "source": [ + "### πŸ” Iteration is key – preview the dataset!\n", + "\n", + "1. Use the `preview` method to generate a sample of records quickly.\n", + "\n", + "2. Inspect the results for quality and format issues.\n", + "\n", + "3. Adjust column configurations, prompts, or parameters as needed.\n", + "\n", + "4. 
Re-run the preview until satisfied.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d2a8ed5", + "metadata": {}, + "outputs": [], + "source": [ + "preview = data_designer.preview(config_builder, num_records=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dca570cd", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell multiple times to cycle through the 2 preview records.\n", + "preview.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e816243", + "metadata": {}, + "outputs": [], + "source": [ + "# The preview dataset is available as a pandas DataFrame.\n", + "preview.dataset" + ] + }, + { + "cell_type": "markdown", + "id": "03b437c0", + "metadata": {}, + "source": [ + "### πŸ“Š Analyze the generated data\n", + "\n", + "- Data Designer automatically generates a basic statistical analysis of the generated data.\n", + "\n", + "- This analysis is available via the `analysis` property of generation result objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7aface55", + "metadata": {}, + "outputs": [], + "source": [ + "# Print the analysis as a table.\n", + "preview.analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "82af5b0f", + "metadata": {}, + "source": [ + "### πŸ†™ Scale up!\n", + "\n", + "- Happy with your preview data?\n", + "\n", + "- Use the `create` method to submit larger Data Designer generation jobs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32e856ad", + "metadata": {}, + "outputs": [], + "source": [ + "results = data_designer.create(config_builder, num_records=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a9d6d09", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the generated dataset as a pandas DataFrame.\n", + "dataset = results.load_dataset()\n", + "\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "9955c7ea", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the analysis results into memory.\n", + "analysis = results.load_analysis()\n", + "\n", + "analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "6c233d3d", + "metadata": {}, + "source": [ + "## ⏭️ Next Steps\n", + "\n", + "Now that you've seen the basics of Data Designer, check out the following notebooks to learn more about:\n", + "\n", + "- [Structured outputs and jinja expressions](/notebooks/2-structured-outputs-and-jinja-expressions/)\n", + "\n", + "- [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb new file mode 100644 index 00000000..8fcaec40 --- /dev/null +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -0,0 +1,564 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86ed3c32", + "metadata": {}, + "source": [ + "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", + "\n", + "#### πŸ“š What you'll learn\n", + "\n", + "In this notebook, we will continue our exploration of Data Designer, demonstrating more advanced data generation using structured outputs and Jinja expressions.\n", + "\n", + "If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.\n" + ] + }, + { + "cell_type": "markdown", + "id": "d4ed5a85", + "metadata": {}, + "source": [ + "### ⚑ Colab Setup\n", + "\n", + "Run the cells below to set up the environment for Google Colab.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d7db456", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Install data-designer and dependencies\n", + "!pip install -q data-designer \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bd0ce7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up NVIDIA API key from Colab secrets\n", + "from google.colab import userdata\n", + "\n", + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "b3c71fba", + "metadata": {}, + "source": [ + "### πŸ“¦ Import the essentials\n", + "\n", + "- The `essentials` module provides quick access to the most commonly used objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6a3cbe2", + "metadata": {}, + "outputs": [], + "source": [ + "from data_designer.essentials import (\n", + " CategorySamplerParams,\n", + " DataDesigner,\n", + " DataDesignerConfigBuilder,\n", + " ExpressionColumnConfig,\n", + " InferenceParameters,\n", + " LLMStructuredColumnConfig,\n", + " ModelConfig,\n", + " PersonFromFakerSamplerParams,\n", + " SamplerColumnConfig,\n", + " SamplerType,\n", + " SubcategorySamplerParams,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9678504c", + "metadata": {}, + "source": [ + "### βš™οΈ Initialize the Data Designer interface\n", + "\n", + "- `DataDesigner` is the main object that is used to interface with the library.\n", + "\n", + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b583355", + "metadata": {}, + "outputs": [], + "source": [ + "data_designer_client = DataDesigner()" + ] + }, + { + "cell_type": "markdown", + "id": "a2510497", + "metadata": {}, + "source": [ + "### πŸŽ›οΈ Define model configurations\n", + "\n", + "- Each `ModelConfig` defines a model that can be used during the 
generation process.\n", + "\n", + "- The \"model alias\" is used to reference the model in the Data Designer config (as we will see below).\n", + "\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "\n", + "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1616b279", + "metadata": {}, + "outputs": [], + "source": [ + "# This name is set in the model provider configuration.\n", + "MODEL_PROVIDER = \"nvidia\"\n", + "\n", + "# The model ID is from build.nvidia.com.\n", + "MODEL_ID = \"nvidia/nvidia-nemotron-nano-9b-v2\"\n", + "\n", + "# We choose this alias to be descriptive for our use case.\n", + "MODEL_ALIAS = \"nemotron-nano-v2\"\n", + "\n", + "# This sets reasoning to False for the nemotron-nano-v2 model.\n", + "SYSTEM_PROMPT = \"/no_think\"\n", + "\n", + "model_configs = [\n", + " ModelConfig(\n", + " alias=MODEL_ALIAS,\n", + " model=MODEL_ID,\n", + " provider=MODEL_PROVIDER,\n", + " inference_parameters=InferenceParameters(\n", + " temperature=0.5,\n", + " top_p=1.0,\n", + " max_tokens=1024,\n", + " ),\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "ea5fca4a", + "metadata": {}, + "source": [ + "### πŸ—οΈ Initialize the Data Designer Config Builder\n", + "\n", + "- The Data Designer config defines the dataset schema and generation process.\n", + "\n", + "- The config builder provides an intuitive interface for building this configuration.\n", + "\n", + "- The list of model configs is provided to the builder at initialization.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b56cacd", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder = DataDesignerConfigBuilder(model_configs=model_configs)" + ] + }, + { + 
"cell_type": "markdown", + "id": "f7823592", + "metadata": {}, + "source": [ + "### 🧑‍🎨 Designing our data\n", + "\n", + "- We will again create a product review dataset, but this time we will use structured outputs and Jinja expressions.\n", + "\n", + "- Structured outputs let you specify the exact schema of the data you want to generate.\n", + "\n", + "- Data Designer supports schemas specified using either JSON schema or Pydantic data models (recommended).\n", + "\n", + "
\n", + "\n", + "We'll define our structured outputs using [Pydantic](https://docs.pydantic.dev/latest/) data models\n", + "\n", + "> πŸ’‘ **Why Pydantic?**\n", + ">\n", + "> - Pydantic models provide better IDE support and type validation.\n", + ">\n", + "> - They are more Pythonic than raw JSON schemas.\n", + ">\n", + "> - They integrate seamlessly with Data Designer's structured output system.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "395a420c", + "metadata": {}, + "outputs": [], + "source": [ + "from decimal import Decimal\n", + "from typing import Literal\n", + "\n", + "from pydantic import BaseModel, Field\n", + "\n", + "\n", + "# We define a Product schema so that the name, description, and price are generated\n", + "# in one go, with the types and constraints specified.\n", + "class Product(BaseModel):\n", + " name: str = Field(description=\"The name of the product\")\n", + " description: str = Field(description=\"A description of the product\")\n", + " price: Decimal = Field(description=\"The price of the product\", ge=10, le=1000, decimal_places=2)\n", + "\n", + "\n", + "class ProductReview(BaseModel):\n", + " rating: int = Field(description=\"The rating of the product\", ge=1, le=5)\n", + " customer_mood: Literal[\"irritated\", \"mad\", \"happy\", \"neutral\", \"excited\"] = Field(\n", + " description=\"The mood of the customer\"\n", + " )\n", + " review: str = Field(description=\"A review of the product\")" + ] + }, + { + "cell_type": "markdown", + "id": "20f34cc8", + "metadata": {}, + "source": [ + "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdc1f037", + "metadata": {}, + "outputs": [], + "source": [ + "# Since we often only want a few attributes from Person objects, we can\n", + "# set drop=True in the column config to drop the column from the final dataset.\n", + 
"config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"customer\",\n", + " sampler_type=SamplerType.PERSON_FROM_FAKER,\n", + " params=PersonFromFakerSamplerParams(),\n", + " drop=True,\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"product_category\",\n", + " sampler_type=SamplerType.CATEGORY,\n", + " params=CategorySamplerParams(\n", + " values=[\n", + " \"Electronics\",\n", + " \"Clothing\",\n", + " \"Home & Kitchen\",\n", + " \"Books\",\n", + " \"Home Office\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"product_subcategory\",\n", + " sampler_type=SamplerType.SUBCATEGORY,\n", + " params=SubcategorySamplerParams(\n", + " category=\"product_category\",\n", + " values={\n", + " \"Electronics\": [\n", + " \"Smartphones\",\n", + " \"Laptops\",\n", + " \"Headphones\",\n", + " \"Cameras\",\n", + " \"Accessories\",\n", + " ],\n", + " \"Clothing\": [\n", + " \"Men's Clothing\",\n", + " \"Women's Clothing\",\n", + " \"Winter Coats\",\n", + " \"Activewear\",\n", + " \"Accessories\",\n", + " ],\n", + " \"Home & Kitchen\": [\n", + " \"Appliances\",\n", + " \"Cookware\",\n", + " \"Furniture\",\n", + " \"Decor\",\n", + " \"Organization\",\n", + " ],\n", + " \"Books\": [\n", + " \"Fiction\",\n", + " \"Non-Fiction\",\n", + " \"Self-Help\",\n", + " \"Textbooks\",\n", + " \"Classics\",\n", + " ],\n", + " \"Home Office\": [\n", + " \"Desks\",\n", + " \"Chairs\",\n", + " \"Storage\",\n", + " \"Office Supplies\",\n", + " \"Lighting\",\n", + " ],\n", + " },\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"target_age_range\",\n", + " sampler_type=SamplerType.CATEGORY,\n", + " params=CategorySamplerParams(values=[\"18-25\", \"25-35\", \"35-50\", \"50-65\", \"65+\"]),\n", + " )\n", + ")\n", + "\n", + "# Sampler columns support conditional params, which are used if 
the condition is met.\n", + "# In this example, we set the review style to rambling if the target age range is 18-25.\n", + "# Note conditional parameters are only supported for Sampler column types.\n", + "config_builder.add_column(\n", + " SamplerColumnConfig(\n", + " name=\"review_style\",\n", + " sampler_type=SamplerType.CATEGORY,\n", + " params=CategorySamplerParams(\n", + " values=[\"rambling\", \"brief\", \"detailed\", \"structured with bullet points\"],\n", + " weights=[1, 2, 2, 1],\n", + " ),\n", + " conditional_params={\n", + " \"target_age_range == '18-25'\": CategorySamplerParams(values=[\"rambling\"]),\n", + " },\n", + " )\n", + ")\n", + "\n", + "# Optionally validate that the columns are configured correctly.\n", + "config_builder.validate()" + ] + }, + { + "cell_type": "markdown", + "id": "ae1ff751", + "metadata": {}, + "source": [ + "Next, we will use more advanced Jinja expressions to create new columns.\n", + "\n", + "Jinja expressions let you:\n", + "\n", + "- Access nested attributes: `{{ customer.first_name }}`\n", + "\n", + "- Combine values: `{{ customer.first_name }} {{ customer.last_name }}`\n", + "\n", + "- Use conditional logic: `{% if condition %}...{% endif %}`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccd831aa", + "metadata": {}, + "outputs": [], + "source": [ + "# We can create new columns using Jinja expressions that reference\n", + "# existing columns, including attributes of nested objects.\n", + "config_builder.add_column(\n", + " ExpressionColumnConfig(name=\"customer_name\", expr=\"{{ customer.first_name }} {{ customer.last_name }}\")\n", + ")\n", + "\n", + "config_builder.add_column(ExpressionColumnConfig(name=\"customer_age\", expr=\"{{ customer.age }}\"))\n", + "\n", + "config_builder.add_column(\n", + " LLMStructuredColumnConfig(\n", + " name=\"product\",\n", + " prompt=(\n", + " \"Create a product in the '{{ product_category }}' category, focusing on products \"\n", + " \"related to '{{ 
product_subcategory }}'. The target age range of the ideal customer is \"\n", + " \"{{ target_age_range }} years old. The product should be priced between $10 and $1000.\"\n", + " ),\n", + " system_prompt=SYSTEM_PROMPT,\n", + " output_format=Product,\n", + " model_alias=MODEL_ALIAS,\n", + " )\n", + ")\n", + "\n", + "# We can even use if/else logic in our Jinja expressions to create more complex prompt patterns.\n", + "config_builder.add_column(\n", + " LLMStructuredColumnConfig(\n", + " name=\"customer_review\",\n", + " prompt=(\n", + " \"Your task is to write a review for the following product:\\n\\n\"\n", + " \"Product Name: {{ product.name }}\\n\"\n", + " \"Product Description: {{ product.description }}\\n\"\n", + " \"Price: {{ product.price }}\\n\\n\"\n", + " \"Imagine your name is {{ customer_name }} and you are from {{ customer.city }}, {{ customer.state }}. \"\n", + " \"Write the review in a style that is '{{ review_style }}'.\"\n", + " \"{% if target_age_range == '18-25' %}\"\n", + " \"Make sure the review is more informal and conversational.\"\n", + " \"{% else %}\"\n", + " \"Make sure the review is more formal and structured.\"\n", + " \"{% endif %}\"\n", + " ),\n", + " system_prompt=SYSTEM_PROMPT,\n", + " output_format=ProductReview,\n", + " model_alias=MODEL_ALIAS,\n", + " )\n", + ")\n", + "\n", + "config_builder.validate()" + ] + }, + { + "cell_type": "markdown", + "id": "f9efe1e4", + "metadata": {}, + "source": [ + "### πŸ” Iteration is key – preview the dataset!\n", + "\n", + "1. Use the `preview` method to generate a sample of records quickly.\n", + "\n", + "2. Inspect the results for quality and format issues.\n", + "\n", + "3. Adjust column configurations, prompts, or parameters as needed.\n", + "\n", + "4. 
Re-run the preview until satisfied.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "054ff544", + "metadata": {}, + "outputs": [], + "source": [ + "preview = data_designer_client.preview(config_builder, num_records=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e33f6cba", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell multiple times to cycle through the 2 preview records.\n", + "preview.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f896dbac", + "metadata": {}, + "outputs": [], + "source": [ + "# The preview dataset is available as a pandas DataFrame.\n", + "preview.dataset" + ] + }, + { + "cell_type": "markdown", + "id": "85e72e6e", + "metadata": {}, + "source": [ + "### πŸ“Š Analyze the generated data\n", + "\n", + "- Data Designer automatically generates a basic statistical analysis of the generated data.\n", + "\n", + "- This analysis is available via the `analysis` property of generation result objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "378d027e", + "metadata": {}, + "outputs": [], + "source": [ + "# Print the analysis as a table.\n", + "preview.analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "c1442a6d", + "metadata": {}, + "source": [ + "### πŸ†™ Scale up!\n", + "\n", + "- Happy with your preview data?\n", + "\n", + "- Use the `create` method to submit larger Data Designer generation jobs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aee71736", + "metadata": {}, + "outputs": [], + "source": [ + "job_results = data_designer_client.create(config_builder, num_records=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71364f16", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the generated dataset as a pandas DataFrame.\n", + "dataset = job_results.load_dataset()\n", + "\n", + "dataset.head()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "6a57dff2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the analysis results into memory.\n", + "analysis = job_results.load_analysis()\n", + "\n", + "analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "bd540227", + "metadata": {}, + "source": [ + "## ⏭️ Next Steps\n", + "\n", + "Check out the following notebook to learn more about:\n", + "\n", + "- [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb new file mode 100644 index 00000000..6fbdc6dd --- /dev/null +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3fec7172", + "metadata": {}, + "source": [ + "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", + "\n", + "#### πŸ“š What you'll learn\n", + "\n", + "In this notebook, we will demonstrate how to seed synthetic data generation in Data Designer with an external dataset.\n", + "\n", + "If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.\n" + ] + }, + { + "cell_type": "markdown", + "id": "3a00d147", + "metadata": {}, + "source": [ + "### ⚑ Colab Setup\n", + "\n", + "Run the cells below to set up the environment for Google Colab.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f02cd4e", + "metadata": {}, + "outputs": [], + "source": [ + "# Install data-designer and dependencies\n", + "!pip install -q data-designer \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b77a2c6e", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Set up NVIDIA API key from Colab secrets\n", + "from google.colab import userdata\n", + "\n", + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "568cebb4", + "metadata": {}, + "source": [ + "### 📦 Import the essentials\n", + "\n", + "- The `essentials` module provides quick access to the most commonly used objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "240cc94a", + "metadata": {}, + "outputs": [], + "source": [ + "from data_designer.essentials import (\n", + " DataDesigner,\n", + " DataDesignerConfigBuilder,\n", + " InferenceParameters,\n", + " ModelConfig,\n", + " SeedConfig,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ae03b63c", + "metadata": {}, + "source": [ + "### ⚙️ Initialize the Data Designer interface\n", + "\n", + "- `DataDesigner` is the main object responsible for managing the data generation process.\n", + "\n", + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56e14153", + "metadata": {}, + "outputs": [], + "source": [ + "data_designer_client = DataDesigner()" + ] + }, + { + "cell_type": "markdown", + "id": "8edc3dd9", + "metadata": {}, + "source": [ + "### 🎛️ Define model configurations\n", + "\n", + "- Each `ModelConfig` defines a model that can be used during the generation process.\n", + "\n", + "- The \"model alias\" is used to reference the model in the Data Designer config (as we will see below).\n", + "\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "\n", + "- By default, we use 
[build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "843b1bf7", + "metadata": {}, + "outputs": [], + "source": [ + "# This name is set in the model provider configuration.\n", + "MODEL_PROVIDER = \"nvidia\"\n", + "\n", + "# The model ID is from build.nvidia.com.\n", + "MODEL_ID = \"nvidia/nvidia-nemotron-nano-9b-v2\"\n", + "\n", + "# We choose this alias to be descriptive for our use case.\n", + "MODEL_ALIAS = \"nemotron-nano-v2\"\n", + "\n", + "# This sets reasoning to False for the nemotron-nano-v2 model.\n", + "SYSTEM_PROMPT = \"/no_think\"\n", + "\n", + "model_configs = [\n", + " ModelConfig(\n", + " alias=MODEL_ALIAS,\n", + " model=MODEL_ID,\n", + " provider=MODEL_PROVIDER,\n", + " inference_parameters=InferenceParameters(\n", + " temperature=0.5,\n", + " top_p=1.0,\n", + " max_tokens=1024,\n", + " ),\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "08cffbb3", + "metadata": {}, + "source": [ + "### πŸ—οΈ Initialize the Data Designer Config Builder\n", + "\n", + "- The Data Designer config defines the dataset schema and generation process.\n", + "\n", + "- The config builder provides an intuitive interface for building this configuration.\n", + "\n", + "- The list of model configs is provided to the builder at initialization.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76116f51", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder = DataDesignerConfigBuilder(model_configs=model_configs)" + ] + }, + { + "cell_type": "markdown", + "id": "99b3b740", + "metadata": {}, + "source": [ + "## πŸ₯ Prepare a seed dataset\n", + "\n", + "- For this notebook, we'll create a synthetic dataset of patient notes.\n", + "\n", + "- We will _seed_ the generation process with a [symptom-to-diagnosis dataset](https://huggingface.co/datasets/gretelai/symptom_to_diagnosis).\n", + "\n", + "- We already have the dataset 
downloaded in the [data](../data) directory of this repository.\n", + "\n", + "
\n", + "\n", + "> 🌱 **Why use a seed dataset?**\n", + ">\n", + "> - Seed datasets let you steer the generation process by providing context that is specific to your use case.\n", + ">\n", + "> - Seed datasets are also an excellent way to inject real-world diversity into your synthetic data.\n", + ">\n", + "> - During generation, prompt templates can reference any of the seed dataset fields.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "616811de", + "metadata": {}, + "outputs": [], + "source": [ + "# Download sample dataset from Github\n", + "import urllib.request\n", + "\n", + "url = \"https://raw.githubusercontent.com/NVIDIA/GenerativeAIExamples/refs/heads/main/nemo/NeMo-Data-Designer/data/gretelai_symptom_to_diagnosis.csv\"\n", + "local_filename, headers = urllib.request.urlretrieve(url, \"gretelai_symptom_to_diagnosis.csv\")\n", + "\n", + "seed_dataset = SeedConfig(dataset=local_filename)\n", + "\n", + "# Pass the reference to the config builder for use during generation.\n", + "config_builder.with_seed_dataset(seed_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "71f13ab9", + "metadata": {}, + "source": [ + "## 🎨 Designing our synthetic patient notes dataset\n", + "\n", + "- Here we use `add_column` with keyword arguments (rather than imported config objects).\n", + "\n", + "- Generally, we recommend using concrete objects, but this is a convenient shorthand.\n", + "\n", + "- **Note**: The prompt template can reference fields from our seed dataset:\n", + " - `{{ diagnosis }}` - the medical diagnosis from the seed data\n", + " - `{{ patient_summary }}` - the symptom description from the seed data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d2e7438", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder.add_column(\n", + " name=\"patient_sampler\",\n", + " column_type=\"sampler\",\n", + " sampler_type=\"person_from_faker\",\n", + ")\n", + "\n", + "config_builder.add_column(\n", + 
" name=\"doctor_sampler\",\n", + " column_type=\"sampler\",\n", + " sampler_type=\"person_from_faker\",\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"patient_id\",\n", + " column_type=\"sampler\",\n", + " sampler_type=\"uuid\",\n", + " params={\n", + " \"prefix\": \"PT-\",\n", + " \"short_form\": True,\n", + " \"uppercase\": True,\n", + " },\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"first_name\",\n", + " column_type=\"expression\",\n", + " expr=\"{{ patient_sampler.first_name}}\",\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"last_name\",\n", + " column_type=\"expression\",\n", + " expr=\"{{ patient_sampler.last_name }}\",\n", + ")\n", + "\n", + "\n", + "config_builder.add_column(\n", + " name=\"dob\",\n", + " column_type=\"expression\",\n", + " expr=\"{{ patient_sampler.birth_date }}\",\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"symptom_onset_date\",\n", + " column_type=\"sampler\",\n", + " sampler_type=\"datetime\",\n", + " params={\"start\": \"2024-01-01\", \"end\": \"2024-12-31\"},\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"date_of_visit\",\n", + " column_type=\"sampler\",\n", + " sampler_type=\"timedelta\",\n", + " params={\"dt_min\": 1, \"dt_max\": 30, \"reference_column_name\": \"symptom_onset_date\"},\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"physician\",\n", + " column_type=\"expression\",\n", + " expr=\"Dr. 
{{ doctor_sampler.last_name }}\",\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " name=\"physician_notes\",\n", + " column_type=\"llm-text\",\n", + " prompt=\"\"\"\\\n", + "You are a primary-care physician who just had an appointment with {{ first_name }} {{ last_name }},\n", + "who has been struggling with symptoms from {{ diagnosis }} since {{ symptom_onset_date }}.\n", + "The date of today's visit is {{ date_of_visit }}.\n", + "\n", + "{{ patient_summary }}\n", + "\n", + "Write careful notes about your visit with {{ first_name }},\n", + "as Dr. {{ doctor_sampler.first_name }} {{ doctor_sampler.last_name }}.\n", + "\n", + "Format the notes as a busy doctor might.\n", + "\"\"\",\n", + " model_alias=MODEL_ALIAS,\n", + " system_prompt=SYSTEM_PROMPT,\n", + ")\n", + "\n", + "config_builder.validate()" + ] + }, + { + "cell_type": "markdown", + "id": "cb4eb347", + "metadata": {}, + "source": [ + "### πŸ” Iteration is key – preview the dataset!\n", + "\n", + "1. Use the `preview` method to generate a sample of records quickly.\n", + "\n", + "2. Inspect the results for quality and format issues.\n", + "\n", + "3. Adjust column configurations, prompts, or parameters as needed.\n", + "\n", + "4. 
Re-run the preview until satisfied.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55ff2e7f", + "metadata": {}, + "outputs": [], + "source": [ + "preview = data_designer_client.preview(config_builder, num_records=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25ab9d95", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell multiple times to cycle through the 2 preview records.\n", + "preview.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1e9b3a6", + "metadata": {}, + "outputs": [], + "source": [ + "# The preview dataset is available as a pandas DataFrame.\n", + "preview.dataset" + ] + }, + { + "cell_type": "markdown", + "id": "da16ff5a", + "metadata": {}, + "source": [ + "### πŸ“Š Analyze the generated data\n", + "\n", + "- Data Designer automatically generates a basic statistical analysis of the generated data.\n", + "\n", + "- This analysis is available via the `analysis` property of generation result objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a12ed7", + "metadata": {}, + "outputs": [], + "source": [ + "# Print the analysis as a table.\n", + "preview.analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "297eb1aa", + "metadata": {}, + "source": [ + "### πŸ†™ Scale up!\n", + "\n", + "- Happy with your preview data?\n", + "\n", + "- Use the `create` method to submit larger Data Designer generation jobs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47c20a9d", + "metadata": {}, + "outputs": [], + "source": [ + "job_results = data_designer_client.create(config_builder, num_records=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "180e9cc2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the generated dataset as a pandas DataFrame.\n", + "dataset = job_results.load_dataset()\n", + "\n", + "dataset.head()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "61a840d6", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the analysis results into memory.\n", + "analysis = job_results.load_analysis()\n", + "\n", + "analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "8bfdafde", + "metadata": {}, + "source": [ + "## ⏭️ Next Steps\n", + "\n", + "Use Data Designer to generate synthetic data for your specific use case!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb new file mode 100644 index 00000000..2f11d787 --- /dev/null +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -0,0 +1,527 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a7a02425", + "metadata": {}, + "source": [ + "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" + ] + }, + { + "cell_type": "markdown", + "id": "048244cc", + "metadata": {}, + "source": [ + "#### πŸ“š What you'll learn\n", + "\n", + "This notebook demonstrates how to provide images as context to generate text descriptions using vision-language models.\n", + "\n", + "- ✨ **Visual Document Processing**: Converting images to chat-ready format for model consumption\n", + "- πŸ” **Vision-Language Generation**: Using vision models to generate detailed summaries from images\n", + "\n", + "If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.\n" + ] + }, + { + "cell_type": "markdown", + "id": "d87b9fb3", + "metadata": {}, + "source": [ + "### ⚑ Colab Setup\n", + "\n", + "Run the cells below to set up the environment for Google Colab.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b04c6f2", 
+ "metadata": {}, + "outputs": [], + "source": [ + "# Install data-designer and dependencies\n", + "!pip install -q data-designer pillow\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "589de42d", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up NVIDIA API key from Colab secrets\n", + "from google.colab import userdata\n", + "\n", + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "65a793d7", + "metadata": {}, + "source": [ + "### 📦 Import the essentials\n", + "\n", + "- The `essentials` module provides quick access to the most commonly used objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df9164a4", + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library imports\n", + "import base64\n", + "import io\n", + "import uuid\n", + "\n", + "# Third-party imports\n", + "import pandas as pd\n", + "import rich\n", + "from datasets import load_dataset\n", + "from IPython.display import display\n", + "from rich.panel import Panel\n", + "\n", + "# Data Designer imports\n", + "from data_designer.essentials import (\n", + " DataDesigner,\n", + " DataDesignerConfigBuilder,\n", + " ImageContext,\n", + " ImageFormat,\n", + " InferenceParameters,\n", + " LLMTextColumnConfig,\n", + " ModalityDataType,\n", + " ModelConfig,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8a9774fe", + "metadata": {}, + "source": [ + "### ⚙️ Initialize the Data Designer interface\n", + "\n", + "- `DataDesigner` is the main object responsible for managing the data generation process.\n", + "\n", + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bf4c5f8", + "metadata": {}, + "outputs": [], + "source": [ + "data_designer = 
DataDesigner()" + ] + }, + { + "cell_type": "markdown", + "id": "ce1c435c", + "metadata": {}, + "source": [ + "### πŸŽ›οΈ Define model configurations\n", + "\n", + "- Each `ModelConfig` defines a model that can be used during the generation process.\n", + "\n", + "- The \"model alias\" is used to reference the model in the Data Designer config (as we will see below).\n", + "\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "\n", + "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f966056", + "metadata": {}, + "outputs": [], + "source": [ + "# This name is set in the model provider configuration.\n", + "MODEL_PROVIDER = \"nvidia\"\n", + "\n", + "model_configs = [\n", + " ModelConfig(\n", + " alias=\"vision\",\n", + " model=\"meta/llama-4-scout-17b-16e-instruct\",\n", + " provider=MODEL_PROVIDER,\n", + " inference_parameters=InferenceParameters(\n", + " temperature=0.60,\n", + " top_p=0.95,\n", + " max_tokens=2048,\n", + " ),\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "bc91ea68", + "metadata": {}, + "source": [ + "### πŸ—οΈ Initialize the Data Designer Config Builder\n", + "\n", + "- The Data Designer config defines the dataset schema and generation process.\n", + "\n", + "- The config builder provides an intuitive interface for building this configuration.\n", + "\n", + "- The list of model configs is provided to the builder at initialization.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72ae01dc", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder = DataDesignerConfigBuilder(model_configs=model_configs)" + ] + }, + { + "cell_type": "markdown", + "id": "1f05f2d3", + "metadata": {}, + "source": [ + "### 🌱 Seed 
Dataset Creation\n", + "\n", + "In this section, we'll prepare our visual documents as a seed dataset for summarization:\n", + "\n", + "- **Loading Visual Documents**: We use the ColPali dataset containing document images\n", + "- **Image Processing**: Convert images to base64 format for vision model consumption\n", + "- **Metadata Extraction**: Preserve relevant document information (filename, page number, source, etc.)\n", + "\n", + "The seed dataset will be used to generate detailed text summaries of each document image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cd6c722", + "metadata": {}, + "outputs": [], + "source": [ + "# Dataset processing configuration\n", + "IMG_COUNT = 512 # Number of images to process\n", + "BASE64_IMAGE_HEIGHT = 512 # Standardized height for model input\n", + "\n", + "# Load ColPali dataset for visual documents\n", + "img_dataset_cfg = {\"path\": \"vidore/colpali_train_set\", \"split\": \"train\", \"streaming\": True}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5bedd8e", + "metadata": {}, + "outputs": [], + "source": [ + "def resize_image(image, height: int):\n", + " \"\"\"\n", + " Resize image while maintaining aspect ratio.\n", + "\n", + " Args:\n", + " image: PIL Image object\n", + " height: Target height in pixels\n", + "\n", + " Returns:\n", + " Resized PIL Image object\n", + " \"\"\"\n", + " original_width, original_height = image.size\n", + " width = int(original_width * (height / original_height))\n", + " return image.resize((width, height))\n", + "\n", + "\n", + "def convert_image_to_chat_format(record, height: int) -> dict:\n", + " \"\"\"\n", + " Convert PIL image to base64 format for chat template usage.\n", + "\n", + " Args:\n", + " record: Dataset record containing image and metadata\n", + " height: Target height for image resizing\n", + "\n", + " Returns:\n", + " Updated record with base64_image and uuid fields\n", + " \"\"\"\n", + " # Resize image for 
consistent processing\n", + " image = resize_image(record[\"image\"], height)\n", + "\n", + " # Convert to base64 string\n", + " img_buffer = io.BytesIO()\n", + " image.save(img_buffer, format=\"PNG\")\n", + " byte_data = img_buffer.getvalue()\n", + " base64_encoded_data = base64.b64encode(byte_data)\n", + " base64_string = base64_encoded_data.decode(\"utf-8\")\n", + "\n", + " # Return updated record\n", + " return record | {\"base64_image\": base64_string, \"uuid\": str(uuid.uuid4())}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fef568a", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and process the visual document dataset\n", + "print(\"πŸ“₯ Loading and processing document images...\")\n", + "\n", + "img_dataset_iter = iter(\n", + " load_dataset(**img_dataset_cfg).map(convert_image_to_chat_format, fn_kwargs={\"height\": BASE64_IMAGE_HEIGHT})\n", + ")\n", + "img_dataset = pd.DataFrame([next(img_dataset_iter) for _ in range(IMG_COUNT)])\n", + "\n", + "print(f\"βœ… Loaded {len(img_dataset)} images with columns: {list(img_dataset.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8293659d", + "metadata": {}, + "outputs": [], + "source": [ + "img_dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "515ef6c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the seed dataset containing our processed images\n", + "df_seed = pd.DataFrame(img_dataset)[[\"uuid\", \"image_filename\", \"base64_image\", \"page\", \"options\", \"source\"]]\n", + "config_builder.with_seed_dataset(\n", + " DataDesigner.make_seed_reference_from_dataframe(df_seed, file_path=\"colpali_train_set.csv\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f55543b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Add a column to generate detailed document summaries\n", + "config_builder.add_column(\n", + " 
LLMTextColumnConfig(\n", + " name=\"summary\",\n", + " model_alias=\"vision\",\n", + " prompt=(\n", + " \"Provide a detailed summary of the content in this image in Markdown format. \"\n", + " \"Start from the top of the image and then describe it from top to bottom. \"\n", + " \"Place a summary at the bottom.\"\n", + " ),\n", + " multi_modal_context=[\n", + " ImageContext(\n", + " column_name=\"base64_image\",\n", + " data_type=ModalityDataType.BASE64,\n", + " image_format=ImageFormat.PNG,\n", + " )\n", + " ],\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3504fd38", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [] + }, + { + "cell_type": "markdown", + "id": "f7806a5d", + "metadata": {}, + "source": [ + "### πŸ” Iteration is key – preview the dataset!\n", + "\n", + "1. Use the `preview` method to generate a sample of records quickly.\n", + "\n", + "2. Inspect the results for quality and format issues.\n", + "\n", + "3. Adjust column configurations, prompts, or parameters as needed.\n", + "\n", + "4. 
Re-run the preview until satisfied.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4172df5", + "metadata": {}, + "outputs": [], + "source": [ + "preview = data_designer.preview(config_builder, num_records=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd1fc8bf", + "metadata": {}, + "outputs": [], + "source": [ + "# Run this cell multiple times to cycle through the 2 preview records.\n", + "preview.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d106f727", + "metadata": {}, + "outputs": [], + "source": [ + "# The preview dataset is available as a pandas DataFrame.\n", + "preview.dataset" + ] + }, + { + "cell_type": "markdown", + "id": "f6f83fb6", + "metadata": {}, + "source": [ + "### πŸ“Š Analyze the generated data\n", + "\n", + "- Data Designer automatically generates a basic statistical analysis of the generated data.\n", + "\n", + "- This analysis is available via the `analysis` property of generation result objects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d69be80", + "metadata": {}, + "outputs": [], + "source": [ + "# Print the analysis as a table.\n", + "preview.analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "b995dce4", + "metadata": {}, + "source": [ + "### πŸ”Ž Visual Inspection\n", + "\n", + "Let's compare the original document image with the generated summary to validate quality:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c157042b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Compare original document with generated summary\n", + "index = 0 # Change this to view different examples\n", + "\n", + "# Merge preview data with original images for comparison\n", + "comparison_dataset = preview.dataset.merge(pd.DataFrame(img_dataset)[[\"uuid\", \"image\"]], how=\"left\", on=\"uuid\")\n", + "\n", + "# Extract the record for 
display\n", + "record = comparison_dataset.iloc[index]\n", + "\n", + "print(\"πŸ“„ Original Document Image:\")\n", + "display(resize_image(record.image, BASE64_IMAGE_HEIGHT))\n", + "\n", + "print(\"\\nπŸ“ Generated Summary:\")\n", + "rich.print(Panel(record.summary, title=\"Document Summary\", title_align=\"left\"))" + ] + }, + { + "cell_type": "markdown", + "id": "9f29d990", + "metadata": {}, + "source": [ + "### πŸ†™ Scale up!\n", + "\n", + "- Happy with your preview data?\n", + "\n", + "- Use the `create` method to submit larger Data Designer generation jobs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2625415e", + "metadata": {}, + "outputs": [], + "source": [ + "results = data_designer.create(config_builder, num_records=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "595f64fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the generated dataset as a pandas DataFrame.\n", + "dataset = results.load_dataset()\n", + "\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "422a375d", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the analysis results into memory.\n", + "analysis = results.load_analysis()\n", + "\n", + "analysis.to_report()" + ] + }, + { + "cell_type": "markdown", + "id": "8880d71e", + "metadata": {}, + "source": [ + "## ⏭️ Next Steps\n", + "\n", + "Now that you've learned how to use visual context for image summarization in Data Designer, explore more:\n", + "\n", + "- Experiment with different vision models for specific document types\n", + "- Try different prompt variations to generate specialized descriptions (e.g., technical details, key findings)\n", + "- Combine vision-based summaries with other column types for multi-modal workflows\n", + "- Apply this pattern to other vision tasks like image captioning, OCR validation, or visual question answering\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/notebook_source/4-providing-images-as-context.py b/docs/notebook_source/4-providing-images-as-context.py index 10afd4bd..dc2513cd 100644 --- a/docs/notebook_source/4-providing-images-as-context.py +++ b/docs/notebook_source/4-providing-images-as-context.py @@ -26,12 +26,6 @@ # If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series. # -# %% [markdown] -# ### ⬇️ Install dependencies (if required) - -# %% -# !uv pip install pillow - # %% [markdown] # ### πŸ“¦ Import the essentials # diff --git a/docs/overrides/main.html b/docs/overrides/main.html index b2d13089..18759076 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -2,13 +2,16 @@ {% block outdated %} You're not viewing the latest version. - + Click here to go to latest. {% endblock %} {% block content %} {% if page.nb_url %} + + Open In Colab + {% include ".icons/material/download.svg" %} diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py new file mode 100644 index 00000000..8aafd5e3 --- /dev/null +++ b/docs/scripts/generate_colab_notebooks.py @@ -0,0 +1,180 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Script to generate Colab-compatible notebooks from notebook source files. + +This script processes jupytext percent-format Python files and: +1. Injects Colab-specific setup cells (pip install, API key from secrets) +2. Injects cells before the "Import the essentials" section +3. 
Saves the result as .ipynb files in docs/colab_notebooks +""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +import jupytext +from nbformat import NotebookNode +from nbformat.v4 import new_code_cell, new_markdown_cell + + +IMPORT_SECTION_MARKER = "### πŸ“¦ Import the essentials" + +COLAB_SETUP_MARKDOWN = """\ +### ⚑ Colab Setup + +Run the cells below to set up the environment for Google Colab. +""" + +ADDITIONAL_DEPENDENCIES = { + "4-providing-images-as-context.py": "pillow", +} + +COLAB_INSTALL_CELL = """\ +# Install data-designer and dependencies +!pip install -q data-designer {} +""" + +COLAB_API_KEY_CELL = """\ +# Set up NVIDIA API key from Colab secrets +from google.colab import userdata + +import os +os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY") +""" + + +def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode]: + """Create the Colab-specific setup cells to inject before imports.""" + return [ + new_markdown_cell(source=COLAB_SETUP_MARKDOWN), + new_code_cell(source=COLAB_INSTALL_CELL.format(additional_dependencies)), + new_code_cell(source=COLAB_API_KEY_CELL), + ] + + +def find_import_section_index(cells: list[NotebookNode]) -> int: + """Find the index of the 'Import the essentials' markdown cell.""" + for i, cell in enumerate(cells): + if cell.get("cell_type") == "markdown": + source = cell.get("source", "") + if IMPORT_SECTION_MARKER in source: + return i + return -1 + + +def process_notebook(notebook: NotebookNode, source_path: Path) -> NotebookNode: + """Process a notebook to make it Colab-compatible. 
+ + Args: + notebook: The input notebook + + Returns: + The processed notebook with Colab setup cells injected + """ + cells = notebook.cells + + additional_dependencies = ADDITIONAL_DEPENDENCIES.get(source_path.name, "") + + # Find where to insert Colab setup (before "Import the essentials") + import_idx = find_import_section_index(cells) + + if import_idx == -1: + # If not found, insert after first cell (title) + import_idx = 1 + + # Insert Colab setup cells before the import section + colab_cells = create_colab_setup_cells(additional_dependencies) + processed_cells = cells[:import_idx] + colab_cells + cells[import_idx:] + + notebook.cells = processed_cells + return notebook + + +def generate_colab_notebook(source_path: Path, output_dir: Path) -> Path: + """Generate a Colab-compatible notebook from a source file. + + Args: + source_path: Path to the jupytext percent-format Python source file + output_dir: Directory to save the output notebook + + Returns: + Path to the generated notebook + """ + # Read the source file using jupytext + notebook = jupytext.read(source_path) + + # Process the notebook for Colab + notebook = process_notebook(notebook, source_path) + + # Determine output path + output_path = output_dir / f"{source_path.stem}.ipynb" + + # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Write the notebook + jupytext.write(notebook, output_path) + + return output_path + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Generate Colab-compatible notebooks from notebook source files." 
+ ) + parser.add_argument( + "--source-dir", + type=Path, + default=Path("docs/notebook_source"), + help="Directory containing notebook source files (default: docs/notebook_source)", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("docs/colab_notebooks"), + help="Directory to save Colab notebooks (default: docs/colab_notebooks)", + ) + parser.add_argument( + "--files", + nargs="*", + help="Specific files to process (if not specified, process all .py files)", + ) + + args = parser.parse_args() + + # Get list of source files + if args.files: + source_files = [args.source_dir / f for f in args.files] + else: + source_files = sorted(args.source_dir.glob("*.py")) + # Filter out files starting with underscore (like _README.md, _pyproject.toml) + source_files = [f for f in source_files if not f.name.startswith("_")] + + if not source_files: + print(f"No source files found in {args.source_dir}") + return + + print(f"πŸ““ Generating Colab notebooks from {len(source_files)} source file(s)...") + print(f" Source: {args.source_dir}") + print(f" Output: {args.output_dir}") + print() + + for source_path in source_files: + if not source_path.exists(): + print(f"⚠️ Skipping {source_path} (file not found)") + continue + + try: + output_path = generate_colab_notebook(source_path, args.output_dir) + print(f"βœ… {source_path.name} β†’ {output_path.name}") + except Exception as e: + print(f"❌ {source_path.name}: {e}") + + print() + print(f"✨ Colab notebooks saved to {args.output_dir}/") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index e8a488c8..49a6de9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ docs = [ notebooks = [ "jupyter>=1.0.0", "ipykernel>=6.29.0", + "pillow>=12.0.0", ] [build-system] diff --git a/uv.lock b/uv.lock index 85e2405e..5e7cb007 100644 --- a/uv.lock +++ b/uv.lock @@ -754,6 +754,7 @@ docs = [ notebooks = [ { name = "ipykernel" }, { name = "jupyter" }, + { name = "pillow" }, ] 
[package.metadata] @@ -812,6 +813,7 @@ docs = [ notebooks = [ { name = "ipykernel", specifier = ">=6.29.0" }, { name = "jupyter", specifier = ">=1.0.0" }, + { name = "pillow", specifier = ">=12.0.0" }, ] [[package]] @@ -3045,6 +3047,104 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, ] +[[package]] +name = "pillow" +version = "12.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/08/26e68b6b5da219c2a2cb7b563af008b53bb8e6b6fcb3fa40715fcdb2523a/pillow-12.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:3adfb466bbc544b926d50fe8f4a4e6abd8c6bffd28a26177594e6e9b2b76572b", size = 5289809, upload-time = "2025-10-15T18:21:27.791Z" }, + { url = "https://files.pythonhosted.org/packages/cb/e9/4e58fb097fb74c7b4758a680aacd558810a417d1edaa7000142976ef9d2f/pillow-12.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1ac11e8ea4f611c3c0147424eae514028b5e9077dd99ab91e1bd7bc33ff145e1", size = 4650606, upload-time = "2025-10-15T18:21:29.823Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e0/1fa492aa9f77b3bc6d471c468e62bfea1823056bf7e5e4f1914d7ab2565e/pillow-12.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d49e2314c373f4c2b39446fb1a45ed333c850e09d0c59ac79b72eb3b95397363", size = 6221023, upload-time = "2025-10-15T18:21:31.415Z" }, + { url = 
"https://files.pythonhosted.org/packages/c1/09/4de7cd03e33734ccd0c876f0251401f1314e819cbfd89a0fcb6e77927cc6/pillow-12.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c7b2a63fd6d5246349f3d3f37b14430d73ee7e8173154461785e43036ffa96ca", size = 8024937, upload-time = "2025-10-15T18:21:33.453Z" }, + { url = "https://files.pythonhosted.org/packages/2e/69/0688e7c1390666592876d9d474f5e135abb4acb39dcb583c4dc5490f1aff/pillow-12.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d64317d2587c70324b79861babb9c09f71fbb780bad212018874b2c013d8600e", size = 6334139, upload-time = "2025-10-15T18:21:35.395Z" }, + { url = "https://files.pythonhosted.org/packages/ed/1c/880921e98f525b9b44ce747ad1ea8f73fd7e992bafe3ca5e5644bf433dea/pillow-12.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d77153e14b709fd8b8af6f66a3afbb9ed6e9fc5ccf0b6b7e1ced7b036a228782", size = 7026074, upload-time = "2025-10-15T18:21:37.219Z" }, + { url = "https://files.pythonhosted.org/packages/28/03/96f718331b19b355610ef4ebdbbde3557c726513030665071fd025745671/pillow-12.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32ed80ea8a90ee3e6fa08c21e2e091bba6eda8eccc83dbc34c95169507a91f10", size = 6448852, upload-time = "2025-10-15T18:21:39.168Z" }, + { url = "https://files.pythonhosted.org/packages/3a/a0/6a193b3f0cc9437b122978d2c5cbce59510ccf9a5b48825096ed7472da2f/pillow-12.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c828a1ae702fc712978bda0320ba1b9893d99be0badf2647f693cc01cf0f04fa", size = 7117058, upload-time = "2025-10-15T18:21:40.997Z" }, + { url = "https://files.pythonhosted.org/packages/a7/c4/043192375eaa4463254e8e61f0e2ec9a846b983929a8d0a7122e0a6d6fff/pillow-12.0.0-cp310-cp310-win32.whl", hash = "sha256:bd87e140e45399c818fac4247880b9ce719e4783d767e030a883a970be632275", size = 6295431, upload-time = "2025-10-15T18:21:42.518Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/c6/c2f2fc7e56301c21827e689bb8b0b465f1b52878b57471a070678c0c33cd/pillow-12.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:455247ac8a4cfb7b9bc45b7e432d10421aea9fc2e74d285ba4072688a74c2e9d", size = 7000412, upload-time = "2025-10-15T18:21:44.404Z" }, + { url = "https://files.pythonhosted.org/packages/b2/d2/5f675067ba82da7a1c238a73b32e3fd78d67f9d9f80fbadd33a40b9c0481/pillow-12.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:6ace95230bfb7cd79ef66caa064bbe2f2a1e63d93471c3a2e1f1348d9f22d6b7", size = 2435903, upload-time = "2025-10-15T18:21:46.29Z" }, + { url = "https://files.pythonhosted.org/packages/0e/5a/a2f6773b64edb921a756eb0729068acad9fc5208a53f4a349396e9436721/pillow-12.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0fd00cac9c03256c8b2ff58f162ebcd2587ad3e1f2e397eab718c47e24d231cc", size = 5289798, upload-time = "2025-10-15T18:21:47.763Z" }, + { url = "https://files.pythonhosted.org/packages/2e/05/069b1f8a2e4b5a37493da6c5868531c3f77b85e716ad7a590ef87d58730d/pillow-12.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3475b96f5908b3b16c47533daaa87380c491357d197564e0ba34ae75c0f3257", size = 4650589, upload-time = "2025-10-15T18:21:49.515Z" }, + { url = "https://files.pythonhosted.org/packages/61/e3/2c820d6e9a36432503ead175ae294f96861b07600a7156154a086ba7111a/pillow-12.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:110486b79f2d112cf6add83b28b627e369219388f64ef2f960fef9ebaf54c642", size = 6230472, upload-time = "2025-10-15T18:21:51.052Z" }, + { url = "https://files.pythonhosted.org/packages/4f/89/63427f51c64209c5e23d4d52071c8d0f21024d3a8a487737caaf614a5795/pillow-12.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5269cc1caeedb67e6f7269a42014f381f45e2e7cd42d834ede3c703a1d915fe3", size = 8033887, upload-time = "2025-10-15T18:21:52.604Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/1b/c9711318d4901093c15840f268ad649459cd81984c9ec9887756cca049a5/pillow-12.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa5129de4e174daccbc59d0a3b6d20eaf24417d59851c07ebb37aeb02947987c", size = 6343964, upload-time = "2025-10-15T18:21:54.619Z" }, + { url = "https://files.pythonhosted.org/packages/41/1e/db9470f2d030b4995083044cd8738cdd1bf773106819f6d8ba12597d5352/pillow-12.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bee2a6db3a7242ea309aa7ee8e2780726fed67ff4e5b40169f2c940e7eb09227", size = 7034756, upload-time = "2025-10-15T18:21:56.151Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b0/6177a8bdd5ee4ed87cba2de5a3cc1db55ffbbec6176784ce5bb75aa96798/pillow-12.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:90387104ee8400a7b4598253b4c406f8958f59fcf983a6cea2b50d59f7d63d0b", size = 6458075, upload-time = "2025-10-15T18:21:57.759Z" }, + { url = "https://files.pythonhosted.org/packages/bc/5e/61537aa6fa977922c6a03253a0e727e6e4a72381a80d63ad8eec350684f2/pillow-12.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc91a56697869546d1b8f0a3ff35224557ae7f881050e99f615e0119bf934b4e", size = 7125955, upload-time = "2025-10-15T18:21:59.372Z" }, + { url = "https://files.pythonhosted.org/packages/1f/3d/d5033539344ee3cbd9a4d69e12e63ca3a44a739eb2d4c8da350a3d38edd7/pillow-12.0.0-cp311-cp311-win32.whl", hash = "sha256:27f95b12453d165099c84f8a8bfdfd46b9e4bda9e0e4b65f0635430027f55739", size = 6298440, upload-time = "2025-10-15T18:22:00.982Z" }, + { url = "https://files.pythonhosted.org/packages/4d/42/aaca386de5cc8bd8a0254516957c1f265e3521c91515b16e286c662854c4/pillow-12.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b583dc9070312190192631373c6c8ed277254aa6e6084b74bdd0a6d3b221608e", size = 6999256, upload-time = "2025-10-15T18:22:02.617Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/f1/9197c9c2d5708b785f631a6dfbfa8eb3fb9672837cb92ae9af812c13b4ed/pillow-12.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:759de84a33be3b178a64c8ba28ad5c135900359e85fb662bc6e403ad4407791d", size = 2436025, upload-time = "2025-10-15T18:22:04.598Z" }, + { url = "https://files.pythonhosted.org/packages/2c/90/4fcce2c22caf044e660a198d740e7fbc14395619e3cb1abad12192c0826c/pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:53561a4ddc36facb432fae7a9d8afbfaf94795414f5cdc5fc52f28c1dca90371", size = 5249377, upload-time = "2025-10-15T18:22:05.993Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/ed960067543d080691d47d6938ebccbf3976a931c9567ab2fbfab983a5dd/pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:71db6b4c1653045dacc1585c1b0d184004f0d7e694c7b34ac165ca70c0838082", size = 4650343, upload-time = "2025-10-15T18:22:07.718Z" }, + { url = "https://files.pythonhosted.org/packages/e7/a1/f81fdeddcb99c044bf7d6faa47e12850f13cee0849537a7d27eeab5534d4/pillow-12.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2fa5f0b6716fc88f11380b88b31fe591a06c6315e955c096c35715788b339e3f", size = 6232981, upload-time = "2025-10-15T18:22:09.287Z" }, + { url = "https://files.pythonhosted.org/packages/88/e1/9098d3ce341a8750b55b0e00c03f1630d6178f38ac191c81c97a3b047b44/pillow-12.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82240051c6ca513c616f7f9da06e871f61bfd7805f566275841af15015b8f98d", size = 8041399, upload-time = "2025-10-15T18:22:10.872Z" }, + { url = "https://files.pythonhosted.org/packages/a7/62/a22e8d3b602ae8cc01446d0c57a54e982737f44b6f2e1e019a925143771d/pillow-12.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55f818bd74fe2f11d4d7cbc65880a843c4075e0ac7226bc1a23261dbea531953", size = 6347740, upload-time = "2025-10-15T18:22:12.769Z" }, + { url = 
"https://files.pythonhosted.org/packages/4f/87/424511bdcd02c8d7acf9f65caa09f291a519b16bd83c3fb3374b3d4ae951/pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b87843e225e74576437fd5b6a4c2205d422754f84a06942cfaf1dc32243e45a8", size = 7040201, upload-time = "2025-10-15T18:22:14.813Z" }, + { url = "https://files.pythonhosted.org/packages/dc/4d/435c8ac688c54d11755aedfdd9f29c9eeddf68d150fe42d1d3dbd2365149/pillow-12.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c607c90ba67533e1b2355b821fef6764d1dd2cbe26b8c1005ae84f7aea25ff79", size = 6462334, upload-time = "2025-10-15T18:22:16.375Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f2/ad34167a8059a59b8ad10bc5c72d4d9b35acc6b7c0877af8ac885b5f2044/pillow-12.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:21f241bdd5080a15bc86d3466a9f6074a9c2c2b314100dd896ac81ee6db2f1ba", size = 7134162, upload-time = "2025-10-15T18:22:17.996Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b1/a7391df6adacf0a5c2cf6ac1cf1fcc1369e7d439d28f637a847f8803beb3/pillow-12.0.0-cp312-cp312-win32.whl", hash = "sha256:dd333073e0cacdc3089525c7df7d39b211bcdf31fc2824e49d01c6b6187b07d0", size = 6298769, upload-time = "2025-10-15T18:22:19.923Z" }, + { url = "https://files.pythonhosted.org/packages/a2/0b/d87733741526541c909bbf159e338dcace4f982daac6e5a8d6be225ca32d/pillow-12.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe611163f6303d1619bbcb653540a4d60f9e55e622d60a3108be0d5b441017a", size = 7001107, upload-time = "2025-10-15T18:22:21.644Z" }, + { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/f2/de993bb2d21b33a98d031ecf6a978e4b61da207bef02f7b43093774c480d/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643", size = 4045493, upload-time = "2025-10-15T18:22:25.758Z" }, + { url = "https://files.pythonhosted.org/packages/0e/b6/bc8d0c4c9f6f111a783d045310945deb769b806d7574764234ffd50bc5ea/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4", size = 4120461, upload-time = "2025-10-15T18:22:27.286Z" }, + { url = "https://files.pythonhosted.org/packages/5d/57/d60d343709366a353dc56adb4ee1e7d8a2cc34e3fbc22905f4167cfec119/pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399", size = 3576912, upload-time = "2025-10-15T18:22:28.751Z" }, + { url = "https://files.pythonhosted.org/packages/a4/a4/a0a31467e3f83b94d37568294b01d22b43ae3c5d85f2811769b9c66389dd/pillow-12.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c50f36a62a22d350c96e49ad02d0da41dbd17ddc2e29750dbdba4323f85eb4a5", size = 5249132, upload-time = "2025-10-15T18:22:30.641Z" }, + { url = "https://files.pythonhosted.org/packages/83/06/48eab21dd561de2914242711434c0c0eb992ed08ff3f6107a5f44527f5e9/pillow-12.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5193fde9a5f23c331ea26d0cf171fbf67e3f247585f50c08b3e205c7aeb4589b", size = 4650099, upload-time = "2025-10-15T18:22:32.73Z" }, + { url = "https://files.pythonhosted.org/packages/fc/bd/69ed99fd46a8dba7c1887156d3572fe4484e3f031405fcc5a92e31c04035/pillow-12.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bde737cff1a975b70652b62d626f7785e0480918dece11e8fef3c0cf057351c3", size = 6230808, upload-time = "2025-10-15T18:22:34.337Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/94/8fad659bcdbf86ed70099cb60ae40be6acca434bbc8c4c0d4ef356d7e0de/pillow-12.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6597ff2b61d121172f5844b53f21467f7082f5fb385a9a29c01414463f93b07", size = 8037804, upload-time = "2025-10-15T18:22:36.402Z" }, + { url = "https://files.pythonhosted.org/packages/20/39/c685d05c06deecfd4e2d1950e9a908aa2ca8bc4e6c3b12d93b9cafbd7837/pillow-12.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b817e7035ea7f6b942c13aa03bb554fc44fea70838ea21f8eb31c638326584e", size = 6345553, upload-time = "2025-10-15T18:22:38.066Z" }, + { url = "https://files.pythonhosted.org/packages/38/57/755dbd06530a27a5ed74f8cb0a7a44a21722ebf318edbe67ddbd7fb28f88/pillow-12.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4f1231b7dec408e8670264ce63e9c71409d9583dd21d32c163e25213ee2a344", size = 7037729, upload-time = "2025-10-15T18:22:39.769Z" }, + { url = "https://files.pythonhosted.org/packages/ca/b6/7e94f4c41d238615674d06ed677c14883103dce1c52e4af16f000338cfd7/pillow-12.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e51b71417049ad6ab14c49608b4a24d8fb3fe605e5dfabfe523b58064dc3d27", size = 6459789, upload-time = "2025-10-15T18:22:41.437Z" }, + { url = "https://files.pythonhosted.org/packages/9c/14/4448bb0b5e0f22dd865290536d20ec8a23b64e2d04280b89139f09a36bb6/pillow-12.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d120c38a42c234dc9a8c5de7ceaaf899cf33561956acb4941653f8bdc657aa79", size = 7130917, upload-time = "2025-10-15T18:22:43.152Z" }, + { url = "https://files.pythonhosted.org/packages/dd/ca/16c6926cc1c015845745d5c16c9358e24282f1e588237a4c36d2b30f182f/pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098", size = 6302391, upload-time = "2025-10-15T18:22:44.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/2a/dd43dcfd6dae9b6a49ee28a8eedb98c7d5ff2de94a5d834565164667b97b/pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905", size = 7007477, upload-time = "2025-10-15T18:22:46.838Z" }, + { url = "https://files.pythonhosted.org/packages/77/f0/72ea067f4b5ae5ead653053212af05ce3705807906ba3f3e8f58ddf617e6/pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a", size = 2435918, upload-time = "2025-10-15T18:22:48.399Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5e/9046b423735c21f0487ea6cb5b10f89ea8f8dfbe32576fe052b5ba9d4e5b/pillow-12.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7fa22993bac7b77b78cae22bad1e2a987ddf0d9015c63358032f84a53f23cdc3", size = 5251406, upload-time = "2025-10-15T18:22:49.905Z" }, + { url = "https://files.pythonhosted.org/packages/12/66/982ceebcdb13c97270ef7a56c3969635b4ee7cd45227fa707c94719229c5/pillow-12.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f135c702ac42262573fe9714dfe99c944b4ba307af5eb507abef1667e2cbbced", size = 4653218, upload-time = "2025-10-15T18:22:51.587Z" }, + { url = "https://files.pythonhosted.org/packages/16/b3/81e625524688c31859450119bf12674619429cab3119eec0e30a7a1029cb/pillow-12.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c85de1136429c524e55cfa4e033b4a7940ac5c8ee4d9401cc2d1bf48154bbc7b", size = 6266564, upload-time = "2025-10-15T18:22:53.215Z" }, + { url = "https://files.pythonhosted.org/packages/98/59/dfb38f2a41240d2408096e1a76c671d0a105a4a8471b1871c6902719450c/pillow-12.0.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38df9b4bfd3db902c9c2bd369bcacaf9d935b2fff73709429d95cc41554f7b3d", size = 8069260, upload-time = "2025-10-15T18:22:54.933Z" }, + { url = 
"https://files.pythonhosted.org/packages/dc/3d/378dbea5cd1874b94c312425ca77b0f47776c78e0df2df751b820c8c1d6c/pillow-12.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d87ef5795da03d742bf49439f9ca4d027cde49c82c5371ba52464aee266699a", size = 6379248, upload-time = "2025-10-15T18:22:56.605Z" }, + { url = "https://files.pythonhosted.org/packages/84/b0/d525ef47d71590f1621510327acec75ae58c721dc071b17d8d652ca494d8/pillow-12.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aff9e4d82d082ff9513bdd6acd4f5bd359f5b2c870907d2b0a9c5e10d40c88fe", size = 7066043, upload-time = "2025-10-15T18:22:58.53Z" }, + { url = "https://files.pythonhosted.org/packages/61/2c/aced60e9cf9d0cde341d54bf7932c9ffc33ddb4a1595798b3a5150c7ec4e/pillow-12.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8d8ca2b210ada074d57fcee40c30446c9562e542fc46aedc19baf758a93532ee", size = 6490915, upload-time = "2025-10-15T18:23:00.582Z" }, + { url = "https://files.pythonhosted.org/packages/ef/26/69dcb9b91f4e59f8f34b2332a4a0a951b44f547c4ed39d3e4dcfcff48f89/pillow-12.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:99a7f72fb6249302aa62245680754862a44179b545ded638cf1fef59befb57ef", size = 7157998, upload-time = "2025-10-15T18:23:02.627Z" }, + { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" }, + { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" }, + { url = "https://files.pythonhosted.org/packages/54/2a/9a8c6ba2c2c07b71bec92cf63e03370ca5e5f5c5b119b742bcc0cde3f9c5/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:beeae3f27f62308f1ddbcfb0690bf44b10732f2ef43758f169d5e9303165d3f9", size = 4045531, upload-time = "2025-10-15T18:23:10.121Z" }, + { url = "https://files.pythonhosted.org/packages/84/54/836fdbf1bfb3d66a59f0189ff0b9f5f666cee09c6188309300df04ad71fa/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d4827615da15cd59784ce39d3388275ec093ae3ee8d7f0c089b76fa87af756c2", size = 4120554, upload-time = "2025-10-15T18:23:12.14Z" }, + { url = "https://files.pythonhosted.org/packages/0d/cd/16aec9f0da4793e98e6b54778a5fbce4f375c6646fe662e80600b8797379/pillow-12.0.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:3e42edad50b6909089750e65c91aa09aaf1e0a71310d383f11321b27c224ed8a", size = 3576812, upload-time = "2025-10-15T18:23:13.962Z" }, + { url = "https://files.pythonhosted.org/packages/f6/b7/13957fda356dc46339298b351cae0d327704986337c3c69bb54628c88155/pillow-12.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e5d8efac84c9afcb40914ab49ba063d94f5dbdf5066db4482c66a992f47a3a3b", size = 5252689, upload-time = "2025-10-15T18:23:15.562Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f5/eae31a306341d8f331f43edb2e9122c7661b975433de5e447939ae61c5da/pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:266cd5f2b63ff316d5a1bba46268e603c9caf5606d44f38c2873c380950576ad", size = 4650186, upload-time = "2025-10-15T18:23:17.379Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/62/2a88339aa40c4c77e79108facbd307d6091e2c0eb5b8d3cf4977cfca2fe6/pillow-12.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:58eea5ebe51504057dd95c5b77d21700b77615ab0243d8152793dc00eb4faf01", size = 6230308, upload-time = "2025-10-15T18:23:18.971Z" }, + { url = "https://files.pythonhosted.org/packages/c7/33/5425a8992bcb32d1cb9fa3dd39a89e613d09a22f2c8083b7bf43c455f760/pillow-12.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f13711b1a5ba512d647a0e4ba79280d3a9a045aaf7e0cc6fbe96b91d4cdf6b0c", size = 8039222, upload-time = "2025-10-15T18:23:20.909Z" }, + { url = "https://files.pythonhosted.org/packages/d8/61/3f5d3b35c5728f37953d3eec5b5f3e77111949523bd2dd7f31a851e50690/pillow-12.0.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6846bd2d116ff42cba6b646edf5bf61d37e5cbd256425fa089fee4ff5c07a99e", size = 6346657, upload-time = "2025-10-15T18:23:23.077Z" }, + { url = "https://files.pythonhosted.org/packages/3a/be/ee90a3d79271227e0f0a33c453531efd6ed14b2e708596ba5dd9be948da3/pillow-12.0.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c98fa880d695de164b4135a52fd2e9cd7b7c90a9d8ac5e9e443a24a95ef9248e", size = 7038482, upload-time = "2025-10-15T18:23:25.005Z" }, + { url = "https://files.pythonhosted.org/packages/44/34/a16b6a4d1ad727de390e9bd9f19f5f669e079e5826ec0f329010ddea492f/pillow-12.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa3ed2a29a9e9d2d488b4da81dcb54720ac3104a20bf0bd273f1e4648aff5af9", size = 6461416, upload-time = "2025-10-15T18:23:27.009Z" }, + { url = "https://files.pythonhosted.org/packages/b6/39/1aa5850d2ade7d7ba9f54e4e4c17077244ff7a2d9e25998c38a29749eb3f/pillow-12.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d034140032870024e6b9892c692fe2968493790dd57208b2c37e3fb35f6df3ab", size = 7131584, upload-time = "2025-10-15T18:23:29.752Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/db/4fae862f8fad0167073a7733973bfa955f47e2cac3dc3e3e6257d10fab4a/pillow-12.0.0-cp314-cp314-win32.whl", hash = "sha256:1b1b133e6e16105f524a8dec491e0586d072948ce15c9b914e41cdadd209052b", size = 6400621, upload-time = "2025-10-15T18:23:32.06Z" }, + { url = "https://files.pythonhosted.org/packages/2b/24/b350c31543fb0107ab2599464d7e28e6f856027aadda995022e695313d94/pillow-12.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8dc232e39d409036af549c86f24aed8273a40ffa459981146829a324e0848b4b", size = 7142916, upload-time = "2025-10-15T18:23:34.71Z" }, + { url = "https://files.pythonhosted.org/packages/0f/9b/0ba5a6fd9351793996ef7487c4fdbde8d3f5f75dbedc093bb598648fddf0/pillow-12.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:d52610d51e265a51518692045e372a4c363056130d922a7351429ac9f27e70b0", size = 2523836, upload-time = "2025-10-15T18:23:36.967Z" }, + { url = "https://files.pythonhosted.org/packages/f5/7a/ceee0840aebc579af529b523d530840338ecf63992395842e54edc805987/pillow-12.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1979f4566bb96c1e50a62d9831e2ea2d1211761e5662afc545fa766f996632f6", size = 5255092, upload-time = "2025-10-15T18:23:38.573Z" }, + { url = "https://files.pythonhosted.org/packages/44/76/20776057b4bfd1aef4eeca992ebde0f53a4dce874f3ae693d0ec90a4f79b/pillow-12.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b2e4b27a6e15b04832fe9bf292b94b5ca156016bbc1ea9c2c20098a0320d6cf6", size = 4653158, upload-time = "2025-10-15T18:23:40.238Z" }, + { url = "https://files.pythonhosted.org/packages/82/3f/d9ff92ace07be8836b4e7e87e6a4c7a8318d47c2f1463ffcf121fc57d9cb/pillow-12.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb3096c30df99fd01c7bf8e544f392103d0795b9f98ba71a8054bcbf56b255f1", size = 6267882, upload-time = "2025-10-15T18:23:42.434Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/7a/4f7ff87f00d3ad33ba21af78bfcd2f032107710baf8280e3722ceec28cda/pillow-12.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7438839e9e053ef79f7112c881cef684013855016f928b168b81ed5835f3e75e", size = 8071001, upload-time = "2025-10-15T18:23:44.29Z" }, + { url = "https://files.pythonhosted.org/packages/75/87/fcea108944a52dad8cca0715ae6247e271eb80459364a98518f1e4f480c1/pillow-12.0.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d5c411a8eaa2299322b647cd932586b1427367fd3184ffbb8f7a219ea2041ca", size = 6380146, upload-time = "2025-10-15T18:23:46.065Z" }, + { url = "https://files.pythonhosted.org/packages/91/52/0d31b5e571ef5fd111d2978b84603fce26aba1b6092f28e941cb46570745/pillow-12.0.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7e091d464ac59d2c7ad8e7e08105eaf9dafbc3883fd7265ffccc2baad6ac925", size = 7067344, upload-time = "2025-10-15T18:23:47.898Z" }, + { url = "https://files.pythonhosted.org/packages/7b/f4/2dd3d721f875f928d48e83bb30a434dee75a2531bca839bb996bb0aa5a91/pillow-12.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:792a2c0be4dcc18af9d4a2dfd8a11a17d5e25274a1062b0ec1c2d79c76f3e7f8", size = 6491864, upload-time = "2025-10-15T18:23:49.607Z" }, + { url = "https://files.pythonhosted.org/packages/30/4b/667dfcf3d61fc309ba5a15b141845cece5915e39b99c1ceab0f34bf1d124/pillow-12.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:afbefa430092f71a9593a99ab6a4e7538bc9eabbf7bf94f91510d3503943edc4", size = 7158911, upload-time = "2025-10-15T18:23:51.351Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2f/16cabcc6426c32218ace36bf0d55955e813f2958afddbf1d391849fee9d1/pillow-12.0.0-cp314-cp314t-win32.whl", hash = "sha256:3830c769decf88f1289680a59d4f4c46c72573446352e2befec9a8512104fa52", size = 6408045, upload-time = "2025-10-15T18:23:53.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/73/e29aa0c9c666cf787628d3f0dcf379f4791fba79f4936d02f8b37165bdf8/pillow-12.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:905b0365b210c73afb0ebe9101a32572152dfd1c144c7e28968a331b9217b94a", size = 7148282, upload-time = "2025-10-15T18:23:55.316Z" }, + { url = "https://files.pythonhosted.org/packages/c1/70/6b41bdcddf541b437bbb9f47f94d2db5d9ddef6c37ccab8c9107743748a4/pillow-12.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:99353a06902c2e43b43e8ff74ee65a7d90307d82370604746738a1e0661ccca7", size = 2525630, upload-time = "2025-10-15T18:23:57.149Z" }, + { url = "https://files.pythonhosted.org/packages/1d/b3/582327e6c9f86d037b63beebe981425d6811104cb443e8193824ef1a2f27/pillow-12.0.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b22bd8c974942477156be55a768f7aa37c46904c175be4e158b6a86e3a6b7ca8", size = 5215068, upload-time = "2025-10-15T18:23:59.594Z" }, + { url = "https://files.pythonhosted.org/packages/fd/d6/67748211d119f3b6540baf90f92fae73ae51d5217b171b0e8b5f7e5d558f/pillow-12.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:805ebf596939e48dbb2e4922a1d3852cfc25c38160751ce02da93058b48d252a", size = 4614994, upload-time = "2025-10-15T18:24:01.669Z" }, + { url = "https://files.pythonhosted.org/packages/2d/e1/f8281e5d844c41872b273b9f2c34a4bf64ca08905668c8ae730eedc7c9fa/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae81479f77420d217def5f54b5b9d279804d17e982e0f2fa19b1d1e14ab5197", size = 5246639, upload-time = "2025-10-15T18:24:03.403Z" }, + { url = "https://files.pythonhosted.org/packages/94/5a/0d8ab8ffe8a102ff5df60d0de5af309015163bf710c7bb3e8311dd3b3ad0/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aeaefa96c768fc66818730b952a862235d68825c178f1b3ffd4efd7ad2edcb7c", size = 6986839, upload-time = "2025-10-15T18:24:05.344Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/2e/3434380e8110b76cd9eb00a363c484b050f949b4bbe84ba770bb8508a02c/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f2d0abef9e4e2f349305a4f8cc784a8a6c2f58a8c4892eea13b10a943bd26e", size = 5313505, upload-time = "2025-10-15T18:24:07.137Z" }, + { url = "https://files.pythonhosted.org/packages/57/ca/5a9d38900d9d74785141d6580950fe705de68af735ff6e727cb911b64740/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdee52571a343d721fb2eb3b090a82d959ff37fc631e3f70422e0c2e029f3e76", size = 5963654, upload-time = "2025-10-15T18:24:09.579Z" }, + { url = "https://files.pythonhosted.org/packages/95/7e/f896623c3c635a90537ac093c6a618ebe1a90d87206e42309cb5d98a1b9e/pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5", size = 6997850, upload-time = "2025-10-15T18:24:11.495Z" }, +] + [[package]] name = "platformdirs" version = "4.5.0" From c64b5c7b59b039301511b927882ac48dc2f71e01 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 2025 17:56:29 -0300 Subject: [PATCH 02/14] lint --- docs/colab_notebooks/1-the-basics.ipynb | 7 ++++--- .../2-structured-outputs-and-jinja-expressions.ipynb | 7 ++++--- docs/colab_notebooks/3-seeding-with-a-dataset.ipynb | 7 ++++--- docs/colab_notebooks/4-providing-images-as-context.ipynb | 7 ++++--- docs/scripts/generate_colab_notebooks.py | 5 +---- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 5d4c30ad..93390b21 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -30,7 +30,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { @@ -41,10 +41,11 @@ "outputs": [], "source": [ "# Set 
up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 8fcaec40..dce85d40 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -32,7 +32,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { @@ -43,10 +43,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 6fbdc6dd..6db99bf8 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -32,7 +32,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { @@ -43,10 +43,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb 
b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 2f11d787..10da545a 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -41,7 +41,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer pillow\n" + "!pip install -q data-designer pillow" ] }, { @@ -52,10 +52,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index 8aafd5e3..3136927c 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -17,7 +17,6 @@ from nbformat import NotebookNode from nbformat.v4 import new_code_cell, new_markdown_cell - IMPORT_SECTION_MARKER = "### πŸ“¦ Import the essentials" COLAB_SETUP_MARKDOWN = """\ @@ -121,9 +120,7 @@ def generate_colab_notebook(source_path: Path, output_dir: Path) -> Path: def main() -> None: """Main entry point for the script.""" - parser = argparse.ArgumentParser( - description="Generate Colab-compatible notebooks from notebook source files." 
- ) + parser = argparse.ArgumentParser(description="Generate Colab-compatible notebooks from notebook source files.") parser.add_argument( "--source-dir", type=Path, From 9a888395542d0a328f284df1e33931851947b8de Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 2025 18:23:10 -0300 Subject: [PATCH 03/14] fixes --- .github/workflows/check-colab-notebooks.yml | 13 +++- docs/colab_notebooks/1-the-basics.ipynb | 69 +++++++++-------- ...ctured-outputs-and-jinja-expressions.ipynb | 65 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 61 ++++++++------- .../4-providing-images-as-context.ipynb | 75 +++++++++---------- docs/scripts/generate_colab_notebooks.py | 2 +- 6 files changed, 144 insertions(+), 141 deletions(-) diff --git a/.github/workflows/check-colab-notebooks.yml b/.github/workflows/check-colab-notebooks.yml index bb64ecc1..3d1d35a1 100644 --- a/.github/workflows/check-colab-notebooks.yml +++ b/.github/workflows/check-colab-notebooks.yml @@ -29,7 +29,7 @@ jobs: - name: Install dependencies run: | - uv sync --group notebooks + uv sync --group notebooks --group docs - name: Generate Colab notebooks run: | @@ -37,12 +37,19 @@ jobs: - name: Check for differences run: | - if git diff --exit-code docs/colab_notebooks/; then - echo "βœ… Colab notebooks are up-to-date" + # Get the diff, filtering out cell ID changes (which are randomly generated) + # Filter out: diff headers, file markers, hunk markers, and "id" lines + MEANINGFUL_DIFF=$(git diff docs/colab_notebooks/ | grep -E '^[+-]' | grep -v '^[+-]{3}' | grep -vE '^[+-]\s*"id": "[0-9a-fA-F]+",?$' || true) + + if [ -z "$MEANINGFUL_DIFF" ]; then + echo "βœ… Colab notebooks are up-to-date (ignoring cell ID changes)" else echo "❌ Colab notebooks are out of sync with source files" echo "" echo "The generated notebooks differ from the committed ones." echo "Please run 'make generate-colab-notebooks' locally and commit the changes." 
+ echo "" + echo "Differences found:" + echo "$MEANINGFUL_DIFF" exit 1 fi diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 93390b21..6d86cbc2 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "9da2fbbf", + "id": "2ca47def", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "53cc83fe", + "id": "1c07d4d1", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,32 +25,31 @@ { "cell_type": "code", "execution_count": null, - "id": "0ec209f1", + "id": "e6fdbaff", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer \n" ] }, { "cell_type": "code", "execution_count": null, - "id": "636b7151", + "id": "54f7eb1e", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "8572f99c", + "id": "04c89e08", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -61,7 +60,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b80a6d6", + "id": "59f952e2", "metadata": {}, "outputs": [], "source": [ @@ -82,7 +81,7 @@ }, { "cell_type": "markdown", - "id": "a07f86e7", + "id": "f5c741e9", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -95,7 +94,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9b6f8dd", + "id": "2dccabca", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "0e264f8b", + "id": "b872bf95", "metadata": {}, "source": 
[ "### πŸŽ›οΈ Define model configurations\n", @@ -121,7 +120,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0982db6a", + "id": "8f40a46d", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +152,7 @@ }, { "cell_type": "markdown", - "id": "e39ae0f0", + "id": "81360ecf", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -168,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "845a2ce6", + "id": "992ee1bd", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "3c829214", + "id": "0dbcf838", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -194,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3afafa39", + "id": "bc9c46f1", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +202,7 @@ }, { "cell_type": "markdown", - "id": "d774fe6b", + "id": "bbcbb073", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -212,7 +211,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb8ff044", + "id": "56983af7", "metadata": {}, "outputs": [], "source": [ @@ -293,7 +292,7 @@ }, { "cell_type": "markdown", - "id": "6851c834", + "id": "275351cc", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -302,7 +301,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d4832cab", + "id": "6897388a", "metadata": {}, "outputs": [], "source": [ @@ -339,7 +338,7 @@ }, { "cell_type": "markdown", - "id": "205cd3e4", + "id": "b00a82d2", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -354,7 +353,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d140be8a", + "id": "8acf067e", "metadata": {}, "outputs": [], "source": [ @@ -391,7 +390,7 @@ }, { "cell_type": "markdown", - "id": "543509ef", + "id": "34764875", "metadata": {}, "source": [ "### 
πŸ” Iteration is key – preview the dataset!\n", @@ -408,7 +407,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d2a8ed5", + "id": "04a01435", "metadata": {}, "outputs": [], "source": [ @@ -418,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dca570cd", + "id": "42ca5d3f", "metadata": {}, "outputs": [], "source": [ @@ -429,7 +428,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e816243", + "id": "be9316ed", "metadata": {}, "outputs": [], "source": [ @@ -439,7 +438,7 @@ }, { "cell_type": "markdown", - "id": "03b437c0", + "id": "ed536176", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -452,7 +451,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7aface55", + "id": "c763bc92", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "82af5b0f", + "id": "bfe254fd", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -475,7 +474,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32e856ad", + "id": "9c496f62", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +484,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8a9d6d09", + "id": "a5969af9", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +497,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9955c7ea", + "id": "adc4724e", "metadata": {}, "outputs": [], "source": [ @@ -510,7 +509,7 @@ }, { "cell_type": "markdown", - "id": "6c233d3d", + "id": "05172306", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index dce85d40..e5474570 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "86ed3c32", + "id": "581f68ef", "metadata": {}, 
"source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "d4ed5a85", + "id": "59f6ac60", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,32 +27,31 @@ { "cell_type": "code", "execution_count": null, - "id": "7d7db456", + "id": "469f79f8", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer \n" ] }, { "cell_type": "code", "execution_count": null, - "id": "5bd0ce7f", + "id": "080d0605", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "b3c71fba", + "id": "74a9ecdc", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +62,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6a3cbe2", + "id": "85778bb4", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "9678504c", + "id": "b60804b9", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +96,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b583355", + "id": "f695a80c", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +105,7 @@ }, { "cell_type": "markdown", - "id": "a2510497", + "id": "b40cb3e3", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +122,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1616b279", + "id": "1d968e2c", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +154,7 @@ }, { "cell_type": "markdown", - "id": "ea5fca4a", + "id": "8c067ce3", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config 
Builder\n", @@ -170,7 +169,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b56cacd", + "id": "8dd6656a", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "f7823592", + "id": "d7c45860", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -206,7 +205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "395a420c", + "id": "d94bdbac", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +233,7 @@ }, { "cell_type": "markdown", - "id": "20f34cc8", + "id": "9c659516", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -243,7 +242,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bdc1f037", + "id": "035cdb74", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +351,7 @@ }, { "cell_type": "markdown", - "id": "ae1ff751", + "id": "ba758579", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -369,7 +368,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ccd831aa", + "id": "65a26ff9", "metadata": {}, "outputs": [], "source": [ @@ -423,7 +422,7 @@ }, { "cell_type": "markdown", - "id": "f9efe1e4", + "id": "6d35aaba", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -440,7 +439,7 @@ { "cell_type": "code", "execution_count": null, - "id": "054ff544", + "id": "865ef8c6", "metadata": {}, "outputs": [], "source": [ @@ -450,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e33f6cba", + "id": "a5f3bdf3", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +460,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f896dbac", + "id": "a4a90d96", "metadata": {}, "outputs": [], "source": [ @@ -471,7 +470,7 @@ }, { "cell_type": "markdown", - "id": "85e72e6e", + "id": "a517c11c", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -484,7 +483,7 
@@ { "cell_type": "code", "execution_count": null, - "id": "378d027e", + "id": "8944d6cd", "metadata": {}, "outputs": [], "source": [ @@ -494,7 +493,7 @@ }, { "cell_type": "markdown", - "id": "c1442a6d", + "id": "f49329ea", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -507,7 +506,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aee71736", + "id": "08d7aff4", "metadata": {}, "outputs": [], "source": [ @@ -517,7 +516,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71364f16", + "id": "3094d4cd", "metadata": {}, "outputs": [], "source": [ @@ -530,7 +529,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a57dff2", + "id": "ee48df3c", "metadata": {}, "outputs": [], "source": [ @@ -542,7 +541,7 @@ }, { "cell_type": "markdown", - "id": "bd540227", + "id": "2a7356d7", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 6db99bf8..50693058 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3fec7172", + "id": "fef37b50", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "3a00d147", + "id": "6a35705b", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,32 +27,31 @@ { "cell_type": "code", "execution_count": null, - "id": "6f02cd4e", + "id": "7cb8d485", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer \n" ] }, { "cell_type": "code", "execution_count": null, - "id": "b77a2c6e", + "id": "04f04275", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import 
userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "568cebb4", + "id": "3a43e9b2", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +62,7 @@ { "cell_type": "code", "execution_count": null, - "id": "240cc94a", + "id": "bdc39109", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "ae03b63c", + "id": "f53729bb", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +90,7 @@ { "cell_type": "code", "execution_count": null, - "id": "56e14153", + "id": "4a59a0f4", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +99,7 @@ }, { "cell_type": "markdown", - "id": "8edc3dd9", + "id": "95d5084d", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "843b1bf7", + "id": "87d40f27", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +148,7 @@ }, { "cell_type": "markdown", - "id": "08cffbb3", + "id": "19c68b1d", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -164,7 +163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "76116f51", + "id": "524a6cf7", "metadata": {}, "outputs": [], "source": [ @@ -173,7 +172,7 @@ }, { "cell_type": "markdown", - "id": "99b3b740", + "id": "2660d181", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -198,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "616811de", + "id": "8f494042", "metadata": {}, "outputs": [], "source": [ @@ -216,7 +215,7 @@ }, { "cell_type": "markdown", - "id": "71f13ab9", + "id": "38072683", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -233,7 +232,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d2e7438", + 
"id": "27865737", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +322,7 @@ }, { "cell_type": "markdown", - "id": "cb4eb347", + "id": "5a23deb5", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -340,7 +339,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55ff2e7f", + "id": "ca24bc09", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25ab9d95", + "id": "b5a965ed", "metadata": {}, "outputs": [], "source": [ @@ -361,7 +360,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b1e9b3a6", + "id": "425b63fa", "metadata": {}, "outputs": [], "source": [ @@ -371,7 +370,7 @@ }, { "cell_type": "markdown", - "id": "da16ff5a", + "id": "c6b326af", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -384,7 +383,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82a12ed7", + "id": "0fdf5ec8", "metadata": {}, "outputs": [], "source": [ @@ -394,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "297eb1aa", + "id": "b510cc12", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -407,7 +406,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47c20a9d", + "id": "a1f86e26", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +416,7 @@ { "cell_type": "code", "execution_count": null, - "id": "180e9cc2", + "id": "5e687b16", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +429,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61a840d6", + "id": "550f90f1", "metadata": {}, "outputs": [], "source": [ @@ -442,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "8bfdafde", + "id": "2b4b28a5", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 10da545a..ae641472 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ 
b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a7a02425", + "id": "b4dc9a5b", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "048244cc", + "id": "412786ca", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "d87b9fb3", + "id": "56484cd1", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,32 +36,31 @@ { "cell_type": "code", "execution_count": null, - "id": "6b04c6f2", + "id": "1138042c", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer pillow" + "!pip install -q data-designer pillow\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "589de42d", + "id": "24fe27c9", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "65a793d7", + "id": "9157cc75", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -72,7 +71,7 @@ { "cell_type": "code", "execution_count": null, - "id": "df9164a4", + "id": "1ad5b4e0", "metadata": {}, "outputs": [], "source": [ @@ -103,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "8a9774fe", + "id": "04e25ed4", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -116,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5bf4c5f8", + "id": "195fc878", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +124,7 @@ }, { "cell_type": "markdown", - "id": "ce1c435c", + "id": "0bb11458", "metadata": {}, "source": [ "### πŸŽ›οΈ 
Define model configurations\n", @@ -142,7 +141,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f966056", + "id": "12dcb08d", "metadata": {}, "outputs": [], "source": [ @@ -165,7 +164,7 @@ }, { "cell_type": "markdown", - "id": "bc91ea68", + "id": "91a381d1", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -180,7 +179,7 @@ { "cell_type": "code", "execution_count": null, - "id": "72ae01dc", + "id": "df89f918", "metadata": {}, "outputs": [], "source": [ @@ -189,7 +188,7 @@ }, { "cell_type": "markdown", - "id": "1f05f2d3", + "id": "9c5aae47", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -206,7 +205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9cd6c722", + "id": "b086e19d", "metadata": {}, "outputs": [], "source": [ @@ -221,7 +220,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5bedd8e", + "id": "11c74943", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +268,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7fef568a", + "id": "dbe25614", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8293659d", + "id": "1118f9f5", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +296,7 @@ { "cell_type": "code", "execution_count": null, - "id": "515ef6c8", + "id": "9003a5f8", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f55543b", + "id": "02b1081b", "metadata": { "lines_to_next_cell": 2 }, @@ -340,7 +339,7 @@ }, { "cell_type": "markdown", - "id": "3504fd38", + "id": "a27d5be9", "metadata": { "lines_to_next_cell": 2 }, @@ -348,7 +347,7 @@ }, { "cell_type": "markdown", - "id": "f7806a5d", + "id": "e235758f", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -365,7 +364,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4172df5", + "id": "16c5296a", "metadata": {}, 
"outputs": [], "source": [ @@ -375,7 +374,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bd1fc8bf", + "id": "fe76f3e3", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +385,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d106f727", + "id": "7c8eb9e6", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +395,7 @@ }, { "cell_type": "markdown", - "id": "f6f83fb6", + "id": "3d6ca462", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -409,7 +408,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d69be80", + "id": "1510a8be", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +418,7 @@ }, { "cell_type": "markdown", - "id": "b995dce4", + "id": "942c22b2", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -430,7 +429,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c157042b", + "id": "5bb55fc9", "metadata": { "lines_to_next_cell": 2 }, @@ -454,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "9f29d990", + "id": "ab1f598c", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -467,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2625415e", + "id": "d81afa35", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "595f64fa", + "id": "05929961", "metadata": {}, "outputs": [], "source": [ @@ -490,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "422a375d", + "id": "102ae3ac", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "8880d71e", + "id": "82453acd", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index 3136927c..b7dd8f42 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -113,7 +113,7 @@ def generate_colab_notebook(source_path: Path, output_dir: Path) -> Path: 
output_dir.mkdir(parents=True, exist_ok=True) # Write the notebook - jupytext.write(notebook, output_path) + jupytext.write(notebook, output_path, config={"metadata": {"jupytext": {"cell_metadata_filter": "-id"}}}) return output_path From 6cfcc19568cb0316a44e29339cb9333228343bca Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 2025 18:27:21 -0300 Subject: [PATCH 04/14] lint again --- docs/colab_notebooks/1-the-basics.ipynb | 7 ++++--- .../2-structured-outputs-and-jinja-expressions.ipynb | 7 ++++--- docs/colab_notebooks/3-seeding-with-a-dataset.ipynb | 7 ++++--- docs/colab_notebooks/4-providing-images-as-context.ipynb | 7 ++++--- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 6d86cbc2..98f91ead 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -30,7 +30,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { @@ -41,10 +41,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index e5474570..8a9904df 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -32,7 +32,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { @@ -43,10 +43,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key 
from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 50693058..bd3a1856 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -32,7 +32,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { @@ -43,10 +43,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index ae641472..58265edb 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -41,7 +41,7 @@ "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer pillow\n" + "!pip install -q data-designer pillow" ] }, { @@ -52,10 +52,11 @@ "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { From aa33a562dffc88f962c0027e1bef215bc1a1879c Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 2025 18:29:36 -0300 Subject: [PATCH 05/14] other changes 
apparently --- docs/colab_notebooks/1-the-basics.ipynb | 69 +++++++++-------- ...ctured-outputs-and-jinja-expressions.ipynb | 65 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 61 ++++++++------- .../4-providing-images-as-context.ipynb | 75 +++++++++---------- 4 files changed, 133 insertions(+), 137 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 98f91ead..e11fc53e 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "2ca47def", + "id": "566cf967", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "1c07d4d1", + "id": "6dc9e12e", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,32 +25,31 @@ { "cell_type": "code", "execution_count": null, - "id": "e6fdbaff", + "id": "965af885", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer \n" ] }, { "cell_type": "code", "execution_count": null, - "id": "54f7eb1e", + "id": "48b4b70f", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "04c89e08", + "id": "995ab7ce", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -61,7 +60,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59f952e2", + "id": "4fa3d7be", "metadata": {}, "outputs": [], "source": [ @@ -82,7 +81,7 @@ }, { "cell_type": "markdown", - "id": "f5c741e9", + "id": "a05509e3", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -95,7 +94,7 @@ 
{ "cell_type": "code", "execution_count": null, - "id": "2dccabca", + "id": "62b77ee5", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "b872bf95", + "id": "2cc36c72", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -121,7 +120,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f40a46d", + "id": "05f87a82", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +152,7 @@ }, { "cell_type": "markdown", - "id": "81360ecf", + "id": "d6b44559", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -168,7 +167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "992ee1bd", + "id": "2d5e514b", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "0dbcf838", + "id": "1bcb48b3", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -194,7 +193,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc9c46f1", + "id": "9bd92341", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +202,7 @@ }, { "cell_type": "markdown", - "id": "bbcbb073", + "id": "f5c6ed39", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -212,7 +211,7 @@ { "cell_type": "code", "execution_count": null, - "id": "56983af7", + "id": "b9b666af", "metadata": {}, "outputs": [], "source": [ @@ -293,7 +292,7 @@ }, { "cell_type": "markdown", - "id": "275351cc", + "id": "da175b7b", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -302,7 +301,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6897388a", + "id": "98f1ac50", "metadata": {}, "outputs": [], "source": [ @@ -339,7 +338,7 @@ }, { "cell_type": "markdown", - "id": "b00a82d2", + "id": "12bac21a", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -354,7 +353,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "8acf067e", + "id": "b413ed5a", "metadata": {}, "outputs": [], "source": [ @@ -391,7 +390,7 @@ }, { "cell_type": "markdown", - "id": "34764875", + "id": "98e77924", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -408,7 +407,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04a01435", + "id": "876ba0f2", "metadata": {}, "outputs": [], "source": [ @@ -418,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "42ca5d3f", + "id": "2af68741", "metadata": {}, "outputs": [], "source": [ @@ -429,7 +428,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be9316ed", + "id": "5916c3cb", "metadata": {}, "outputs": [], "source": [ @@ -439,7 +438,7 @@ }, { "cell_type": "markdown", - "id": "ed536176", + "id": "ac6d7448", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -452,7 +451,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c763bc92", + "id": "cf494d0a", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "bfe254fd", + "id": "357b2c86", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -475,7 +474,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c496f62", + "id": "84bd7549", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +484,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5969af9", + "id": "d266d2a1", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +497,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adc4724e", + "id": "6328de8f", "metadata": {}, "outputs": [], "source": [ @@ -510,7 +509,7 @@ }, { "cell_type": "markdown", - "id": "05172306", + "id": "1b81038b", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 8a9904df..cbf1dcf0 100644 --- 
a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "581f68ef", + "id": "26826211", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "59f6ac60", + "id": "832b88b4", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,32 +27,31 @@ { "cell_type": "code", "execution_count": null, - "id": "469f79f8", + "id": "969bbbcc", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer \n" ] }, { "cell_type": "code", "execution_count": null, - "id": "080d0605", + "id": "41748474", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "74a9ecdc", + "id": "4e8014d8", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +62,7 @@ { "cell_type": "code", "execution_count": null, - "id": "85778bb4", + "id": "7352bbf0", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +83,7 @@ }, { "cell_type": "markdown", - "id": "b60804b9", + "id": "14963f9f", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +96,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f695a80c", + "id": "eb618d54", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +105,7 @@ }, { "cell_type": "markdown", - "id": "b40cb3e3", + "id": "caa3f57f", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +122,7 @@ { "cell_type": "code", "execution_count": null, 
- "id": "1d968e2c", + "id": "5549152c", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +154,7 @@ }, { "cell_type": "markdown", - "id": "8c067ce3", + "id": "e7c5d645", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -170,7 +169,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8dd6656a", + "id": "6ac98165", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +178,7 @@ }, { "cell_type": "markdown", - "id": "d7c45860", + "id": "bb99873c", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -206,7 +205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d94bdbac", + "id": "a1619f2e", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +233,7 @@ }, { "cell_type": "markdown", - "id": "9c659516", + "id": "8895a7f0", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -243,7 +242,7 @@ { "cell_type": "code", "execution_count": null, - "id": "035cdb74", + "id": "af32e36e", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +351,7 @@ }, { "cell_type": "markdown", - "id": "ba758579", + "id": "b8241c06", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -369,7 +368,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65a26ff9", + "id": "ba29a63d", "metadata": {}, "outputs": [], "source": [ @@ -423,7 +422,7 @@ }, { "cell_type": "markdown", - "id": "6d35aaba", + "id": "4f733458", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -440,7 +439,7 @@ { "cell_type": "code", "execution_count": null, - "id": "865ef8c6", + "id": "c5efd286", "metadata": {}, "outputs": [], "source": [ @@ -450,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5f3bdf3", + "id": "5245e40f", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +460,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"a4a90d96", + "id": "07e46b28", "metadata": {}, "outputs": [], "source": [ @@ -471,7 +470,7 @@ }, { "cell_type": "markdown", - "id": "a517c11c", + "id": "6388cffb", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -484,7 +483,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8944d6cd", + "id": "4af08e47", "metadata": {}, "outputs": [], "source": [ @@ -494,7 +493,7 @@ }, { "cell_type": "markdown", - "id": "f49329ea", + "id": "24884cc3", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -507,7 +506,7 @@ { "cell_type": "code", "execution_count": null, - "id": "08d7aff4", + "id": "03f2d0e2", "metadata": {}, "outputs": [], "source": [ @@ -517,7 +516,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3094d4cd", + "id": "98db8bc6", "metadata": {}, "outputs": [], "source": [ @@ -530,7 +529,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee48df3c", + "id": "0234d630", "metadata": {}, "outputs": [], "source": [ @@ -542,7 +541,7 @@ }, { "cell_type": "markdown", - "id": "2a7356d7", + "id": "e4f790c7", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index bd3a1856..3b1ab4fa 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "fef37b50", + "id": "b52e3e14", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "6a35705b", + "id": "e8bdd897", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,32 +27,31 @@ { "cell_type": "code", "execution_count": null, - "id": "7cb8d485", + "id": "9f8f43fd", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q 
data-designer \n" ] }, { "cell_type": "code", "execution_count": null, - "id": "04f04275", + "id": "5cd30df0", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "3a43e9b2", + "id": "19140f29", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +62,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bdc39109", + "id": "5111abc3", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "f53729bb", + "id": "7c05285e", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +90,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4a59a0f4", + "id": "a6d36f9f", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +99,7 @@ }, { "cell_type": "markdown", - "id": "95d5084d", + "id": "737515ef", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87d40f27", + "id": "ff4d27e7", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +148,7 @@ }, { "cell_type": "markdown", - "id": "19c68b1d", + "id": "05d3f7b5", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -164,7 +163,7 @@ { "cell_type": "code", "execution_count": null, - "id": "524a6cf7", + "id": "59a6cf53", "metadata": {}, "outputs": [], "source": [ @@ -173,7 +172,7 @@ }, { "cell_type": "markdown", - "id": "2660d181", + "id": "0698182b", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -198,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f494042", + "id": "623fb7fa", "metadata": {}, "outputs": [], "source": [ @@ -216,7 
+215,7 @@ }, { "cell_type": "markdown", - "id": "38072683", + "id": "24dcb1b2", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -233,7 +232,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27865737", + "id": "773fb7e4", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +322,7 @@ }, { "cell_type": "markdown", - "id": "5a23deb5", + "id": "86c89239", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -340,7 +339,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca24bc09", + "id": "47cdc500", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +349,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5a965ed", + "id": "b06eface", "metadata": {}, "outputs": [], "source": [ @@ -361,7 +360,7 @@ { "cell_type": "code", "execution_count": null, - "id": "425b63fa", + "id": "e5cafa10", "metadata": {}, "outputs": [], "source": [ @@ -371,7 +370,7 @@ }, { "cell_type": "markdown", - "id": "c6b326af", + "id": "ef9e55d7", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -384,7 +383,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0fdf5ec8", + "id": "993cd27f", "metadata": {}, "outputs": [], "source": [ @@ -394,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "b510cc12", + "id": "960e2c6c", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -407,7 +406,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a1f86e26", + "id": "1bd696b5", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +416,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5e687b16", + "id": "7b1be27a", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +429,7 @@ { "cell_type": "code", "execution_count": null, - "id": "550f90f1", + "id": "4101f6a2", "metadata": {}, "outputs": [], "source": [ @@ -442,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "2b4b28a5", + "id": "857a0ded", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git 
a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 58265edb..42bbac67 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b4dc9a5b", + "id": "1e86fc40", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "412786ca", + "id": "c59cb50b", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "56484cd1", + "id": "109f3eed", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,32 +36,31 @@ { "cell_type": "code", "execution_count": null, - "id": "1138042c", + "id": "6aa21eed", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer pillow" + "!pip install -q data-designer pillow\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "24fe27c9", + "id": "dd12beef", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", - "import os\n", - "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "import os\n", + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" ] }, { "cell_type": "markdown", - "id": "9157cc75", + "id": "c4162d82", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -72,7 +71,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1ad5b4e0", + "id": "ecec234b", "metadata": {}, "outputs": [], "source": [ @@ -103,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "04e25ed4", + "id": "504f1d40", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -116,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "195fc878", + "id": "e59c8ab0", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +124,7 @@ }, { "cell_type": "markdown", - "id": "0bb11458", + "id": "71bb9093", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -142,7 +141,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12dcb08d", + "id": "dfceb4f8", "metadata": {}, "outputs": [], "source": [ @@ -165,7 +164,7 @@ }, { "cell_type": "markdown", - "id": "91a381d1", + "id": "08426a33", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -180,7 +179,7 @@ { "cell_type": "code", "execution_count": null, - "id": "df89f918", + "id": "afad97d0", "metadata": {}, "outputs": [], "source": [ @@ -189,7 +188,7 @@ }, { "cell_type": "markdown", - "id": "9c5aae47", + "id": "a4e86661", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -206,7 +205,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b086e19d", + "id": "b906647b", "metadata": {}, "outputs": [], "source": [ @@ -221,7 +220,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11c74943", + "id": "0e2827d8", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +268,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dbe25614", + "id": "aff5bddf", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1118f9f5", + "id": "b46db348", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +296,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9003a5f8", + "id": "93014a2a", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "02b1081b", + "id": "b5d75ac8", "metadata": { "lines_to_next_cell": 2 }, @@ -340,7 +339,7 @@ }, { "cell_type": "markdown", - "id": "a27d5be9", + "id": "0bf45f8a", "metadata": { "lines_to_next_cell": 2 }, @@ -348,7 +347,7 @@ }, { "cell_type": "markdown", - "id": "e235758f", + "id": 
"ce1f9b8f", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -365,7 +364,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16c5296a", + "id": "eb4ebb7e", "metadata": {}, "outputs": [], "source": [ @@ -375,7 +374,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe76f3e3", + "id": "28bc690a", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +385,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c8eb9e6", + "id": "572bdc4c", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +395,7 @@ }, { "cell_type": "markdown", - "id": "3d6ca462", + "id": "35a24fdd", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -409,7 +408,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1510a8be", + "id": "e3ec9f06", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +418,7 @@ }, { "cell_type": "markdown", - "id": "942c22b2", + "id": "10a548d1", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -430,7 +429,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5bb55fc9", + "id": "b0dfa425", "metadata": { "lines_to_next_cell": 2 }, @@ -454,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "ab1f598c", + "id": "b843e820", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -467,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d81afa35", + "id": "6856fa9f", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05929961", + "id": "8d656659", "metadata": {}, "outputs": [], "source": [ @@ -490,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "102ae3ac", + "id": "9dfa4c11", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "82453acd", + "id": "15fcf9f1", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", From 006904b193dfd6aa88f53b87d1f219b9ea4fc34a Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 
2025 18:41:12 -0300 Subject: [PATCH 06/14] injected blocks need to pass ruff --- docs/colab_notebooks/1-the-basics.ipynb | 69 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 65 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 61 +++++++-------- .../4-providing-images-as-context.ipynb | 75 ++++++++++--------- docs/scripts/generate_colab_notebooks.py | 11 ++- 5 files changed, 142 insertions(+), 139 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index e11fc53e..356cbc88 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "566cf967", + "id": "4b31b914", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "6dc9e12e", + "id": "591c96b6", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,31 +25,32 @@ { "cell_type": "code", "execution_count": null, - "id": "965af885", + "id": "8e03a646", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "48b4b70f", + "id": "5fa2627e", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { "cell_type": "markdown", - "id": "995ab7ce", + "id": "7142c668", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -60,7 +61,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4fa3d7be", + "id": "4d314046", "metadata": {}, "outputs": [], "source": [ @@ -81,7 +82,7 @@ }, { "cell_type": "markdown", - "id": "a05509e3", + 
"id": "2c2c91a1", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -94,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62b77ee5", + "id": "c2b97c7d", "metadata": {}, "outputs": [], "source": [ @@ -103,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "2cc36c72", + "id": "385067da", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -120,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05f87a82", + "id": "808602fc", "metadata": {}, "outputs": [], "source": [ @@ -152,7 +153,7 @@ }, { "cell_type": "markdown", - "id": "d6b44559", + "id": "7779673f", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -167,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d5e514b", + "id": "4a16bb2c", "metadata": {}, "outputs": [], "source": [ @@ -176,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "1bcb48b3", + "id": "e9cbe947", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -193,7 +194,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9bd92341", + "id": "899ad203", "metadata": {}, "outputs": [], "source": [ @@ -202,7 +203,7 @@ }, { "cell_type": "markdown", - "id": "f5c6ed39", + "id": "b95c6952", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -211,7 +212,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b9b666af", + "id": "2c4ea55f", "metadata": {}, "outputs": [], "source": [ @@ -292,7 +293,7 @@ }, { "cell_type": "markdown", - "id": "da175b7b", + "id": "39bae663", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -301,7 +302,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98f1ac50", + "id": "0152a21c", "metadata": {}, "outputs": [], "source": [ @@ -338,7 +339,7 @@ }, { "cell_type": "markdown", - "id": 
"12bac21a", + "id": "f9f2b181", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -353,7 +354,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b413ed5a", + "id": "fd2ef4ee", "metadata": {}, "outputs": [], "source": [ @@ -390,7 +391,7 @@ }, { "cell_type": "markdown", - "id": "98e77924", + "id": "9c8cf4b4", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -407,7 +408,7 @@ { "cell_type": "code", "execution_count": null, - "id": "876ba0f2", + "id": "60083d1b", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +418,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2af68741", + "id": "7d5174e3", "metadata": {}, "outputs": [], "source": [ @@ -428,7 +429,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5916c3cb", + "id": "a608cf38", "metadata": {}, "outputs": [], "source": [ @@ -438,7 +439,7 @@ }, { "cell_type": "markdown", - "id": "ac6d7448", + "id": "ac022c04", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -451,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf494d0a", + "id": "f2a09834", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "357b2c86", + "id": "82515637", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -474,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84bd7549", + "id": "cffa3d1a", "metadata": {}, "outputs": [], "source": [ @@ -484,7 +485,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d266d2a1", + "id": "4695344e", "metadata": {}, "outputs": [], "source": [ @@ -497,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6328de8f", + "id": "7c9cc607", "metadata": {}, "outputs": [], "source": [ @@ -509,7 +510,7 @@ }, { "cell_type": "markdown", - "id": "1b81038b", + "id": "e83dac04", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb 
b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index cbf1dcf0..2e33d5ce 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "26826211", + "id": "61350549", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "832b88b4", + "id": "5cdda092", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,31 +27,32 @@ { "cell_type": "code", "execution_count": null, - "id": "969bbbcc", + "id": "ef62a444", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "41748474", + "id": "c3af1cf7", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { "cell_type": "markdown", - "id": "4e8014d8", + "id": "989c24bc", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -62,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7352bbf0", + "id": "d360a00a", "metadata": {}, "outputs": [], "source": [ @@ -83,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "14963f9f", + "id": "77231827", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -96,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb618d54", + "id": "0a595ceb", "metadata": {}, "outputs": [], "source": [ @@ -105,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "caa3f57f", + "id": "a719760f", "metadata": {}, "source": [ "### 
πŸŽ›οΈ Define model configurations\n", @@ -122,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5549152c", + "id": "f3c008cb", "metadata": {}, "outputs": [], "source": [ @@ -154,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "e7c5d645", + "id": "d21ae4eb", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -169,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ac98165", + "id": "1f59b3c3", "metadata": {}, "outputs": [], "source": [ @@ -178,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "bb99873c", + "id": "b7ce6ffe", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -205,7 +206,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a1619f2e", + "id": "9e4a9df9", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +234,7 @@ }, { "cell_type": "markdown", - "id": "8895a7f0", + "id": "d0f95e86", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -242,7 +243,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af32e36e", + "id": "0eef341c", "metadata": {}, "outputs": [], "source": [ @@ -351,7 +352,7 @@ }, { "cell_type": "markdown", - "id": "b8241c06", + "id": "ca232974", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -368,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ba29a63d", + "id": "cc41ccd4", "metadata": {}, "outputs": [], "source": [ @@ -422,7 +423,7 @@ }, { "cell_type": "markdown", - "id": "4f733458", + "id": "1cc86025", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -439,7 +440,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5efd286", + "id": "1210f637", "metadata": {}, "outputs": [], "source": [ @@ -449,7 +450,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5245e40f", + "id": "c1bec249", "metadata": 
{}, "outputs": [], "source": [ @@ -460,7 +461,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07e46b28", + "id": "a00c2f22", "metadata": {}, "outputs": [], "source": [ @@ -470,7 +471,7 @@ }, { "cell_type": "markdown", - "id": "6388cffb", + "id": "e26f2b05", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -483,7 +484,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4af08e47", + "id": "6b0c16c2", "metadata": {}, "outputs": [], "source": [ @@ -493,7 +494,7 @@ }, { "cell_type": "markdown", - "id": "24884cc3", + "id": "6029cce2", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -506,7 +507,7 @@ { "cell_type": "code", "execution_count": null, - "id": "03f2d0e2", + "id": "ede95f7d", "metadata": {}, "outputs": [], "source": [ @@ -516,7 +517,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98db8bc6", + "id": "db0b85e1", "metadata": {}, "outputs": [], "source": [ @@ -529,7 +530,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0234d630", + "id": "676da66a", "metadata": {}, "outputs": [], "source": [ @@ -541,7 +542,7 @@ }, { "cell_type": "markdown", - "id": "e4f790c7", + "id": "879ce764", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 3b1ab4fa..beaade14 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b52e3e14", + "id": "180d64a4", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "e8bdd897", + "id": "33dc31a2", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,31 +27,32 @@ { "cell_type": "code", "execution_count": null, - "id": "9f8f43fd", + "id": "3b223f0e", "metadata": {}, "outputs": [], "source": [ 
"# Install data-designer and dependencies\n", - "!pip install -q data-designer \n" + "!pip install -q data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "5cd30df0", + "id": "9a2ab316", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { "cell_type": "markdown", - "id": "19140f29", + "id": "7f061323", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -62,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5111abc3", + "id": "c4be8e0f", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "7c05285e", + "id": "b89575d7", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -90,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6d36f9f", + "id": "236a2916", "metadata": {}, "outputs": [], "source": [ @@ -99,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "737515ef", + "id": "4df6e463", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -116,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff4d27e7", + "id": "53d41281", "metadata": {}, "outputs": [], "source": [ @@ -148,7 +149,7 @@ }, { "cell_type": "markdown", - "id": "05d3f7b5", + "id": "5cd3f03e", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -163,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59a6cf53", + "id": "6bca047c", "metadata": {}, "outputs": [], "source": [ @@ -172,7 +173,7 @@ }, { "cell_type": "markdown", - "id": "0698182b", + "id": "875c849c", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -197,7 +198,7 @@ { "cell_type": "code", "execution_count": null, 
- "id": "623fb7fa", + "id": "4383058a", "metadata": {}, "outputs": [], "source": [ @@ -215,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "24dcb1b2", + "id": "81b460ba", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -232,7 +233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "773fb7e4", + "id": "55054448", "metadata": {}, "outputs": [], "source": [ @@ -322,7 +323,7 @@ }, { "cell_type": "markdown", - "id": "86c89239", + "id": "48fc0954", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -339,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47cdc500", + "id": "eafe69da", "metadata": {}, "outputs": [], "source": [ @@ -349,7 +350,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b06eface", + "id": "78a93876", "metadata": {}, "outputs": [], "source": [ @@ -360,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5cafa10", + "id": "1004f0e5", "metadata": {}, "outputs": [], "source": [ @@ -370,7 +371,7 @@ }, { "cell_type": "markdown", - "id": "ef9e55d7", + "id": "e712bc10", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -383,7 +384,7 @@ { "cell_type": "code", "execution_count": null, - "id": "993cd27f", + "id": "1f28cf03", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +394,7 @@ }, { "cell_type": "markdown", - "id": "960e2c6c", + "id": "d4511bf3", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -406,7 +407,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1bd696b5", + "id": "5a1ea546", "metadata": {}, "outputs": [], "source": [ @@ -416,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b1be27a", + "id": "ec54d5b7", "metadata": {}, "outputs": [], "source": [ @@ -429,7 +430,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4101f6a2", + "id": "d0bfcb9b", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +442,7 @@ }, { "cell_type": "markdown", - "id": 
"857a0ded", + "id": "cebfcbec", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 42bbac67..48bae7e1 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1e86fc40", + "id": "16835351", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "c59cb50b", + "id": "52a4f774", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "109f3eed", + "id": "7d9d836f", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,31 +36,32 @@ { "cell_type": "code", "execution_count": null, - "id": "6aa21eed", + "id": "611577bd", "metadata": {}, "outputs": [], "source": [ "# Install data-designer and dependencies\n", - "!pip install -q data-designer pillow\n" + "!pip install -q data-designer pillow" ] }, { "cell_type": "code", "execution_count": null, - "id": "dd12beef", + "id": "8f9cd95f", "metadata": {}, "outputs": [], "source": [ "# Set up NVIDIA API key from Colab secrets\n", + "import os\n", + "\n", "from google.colab import userdata\n", "\n", - "import os\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n" + "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" ] }, { "cell_type": "markdown", - "id": "c4162d82", + "id": "1ed0cc0d", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -71,7 +72,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ecec234b", + "id": "2d08fd71", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "504f1d40", + "id": "a210d017", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data 
Designer interface\n", @@ -115,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e59c8ab0", + "id": "c3eb41f3", "metadata": {}, "outputs": [], "source": [ @@ -124,7 +125,7 @@ }, { "cell_type": "markdown", - "id": "71bb9093", + "id": "2a279c56", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -141,7 +142,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dfceb4f8", + "id": "2c93ec09", "metadata": {}, "outputs": [], "source": [ @@ -164,7 +165,7 @@ }, { "cell_type": "markdown", - "id": "08426a33", + "id": "4e2b4500", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -179,7 +180,7 @@ { "cell_type": "code", "execution_count": null, - "id": "afad97d0", + "id": "a7dc93d8", "metadata": {}, "outputs": [], "source": [ @@ -188,7 +189,7 @@ }, { "cell_type": "markdown", - "id": "a4e86661", + "id": "1a914dc3", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -205,7 +206,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b906647b", + "id": "dacf9004", "metadata": {}, "outputs": [], "source": [ @@ -220,7 +221,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0e2827d8", + "id": "e602b972", "metadata": {}, "outputs": [], "source": [ @@ -268,7 +269,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aff5bddf", + "id": "7ad701fd", "metadata": {}, "outputs": [], "source": [ @@ -286,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b46db348", + "id": "c6ade03b", "metadata": {}, "outputs": [], "source": [ @@ -296,7 +297,7 @@ { "cell_type": "code", "execution_count": null, - "id": "93014a2a", + "id": "5dd4c526", "metadata": {}, "outputs": [], "source": [ @@ -310,7 +311,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5d75ac8", + "id": "6fd54792", "metadata": { "lines_to_next_cell": 2 }, @@ -339,7 +340,7 @@ }, { "cell_type": "markdown", - "id": "0bf45f8a", + "id": "69967b8d", "metadata": { "lines_to_next_cell": 
2 }, @@ -347,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "ce1f9b8f", + "id": "827d7a34", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -364,7 +365,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb4ebb7e", + "id": "27b9d461", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +375,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28bc690a", + "id": "f56d6c41", "metadata": {}, "outputs": [], "source": [ @@ -385,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "572bdc4c", + "id": "d828b41d", "metadata": {}, "outputs": [], "source": [ @@ -395,7 +396,7 @@ }, { "cell_type": "markdown", - "id": "35a24fdd", + "id": "60bf5e50", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -408,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3ec9f06", + "id": "fea6e43d", "metadata": {}, "outputs": [], "source": [ @@ -418,7 +419,7 @@ }, { "cell_type": "markdown", - "id": "10a548d1", + "id": "83fa6b55", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -429,7 +430,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0dfa425", + "id": "6fa6945d", "metadata": { "lines_to_next_cell": 2 }, @@ -453,7 +454,7 @@ }, { "cell_type": "markdown", - "id": "b843e820", + "id": "96257f6d", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +467,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6856fa9f", + "id": "a4308cc9", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8d656659", + "id": "1a0747c1", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +490,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9dfa4c11", + "id": "6cfe8cad", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +502,7 @@ }, { "cell_type": "markdown", - "id": "15fcf9f1", + "id": "2ab3bedd", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git 
a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index b7dd8f42..f5180849 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -26,21 +26,20 @@ """ ADDITIONAL_DEPENDENCIES = { - "4-providing-images-as-context.py": "pillow", + "4-providing-images-as-context.py": " pillow", } COLAB_INSTALL_CELL = """\ # Install data-designer and dependencies -!pip install -q data-designer {} -""" +!pip install -q data-designer{}""" COLAB_API_KEY_CELL = """\ # Set up NVIDIA API key from Colab secrets +import os + from google.colab import userdata -import os -os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY") -""" +os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY")""" def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode]: From 2b562fd575c7dd9a6f48ef57ca53edccc66a5bd8 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 11 Dec 2025 18:48:20 -0300 Subject: [PATCH 07/14] diff needs to be empty --- .github/workflows/check-colab-notebooks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-colab-notebooks.yml b/.github/workflows/check-colab-notebooks.yml index 3d1d35a1..7116b4a0 100644 --- a/.github/workflows/check-colab-notebooks.yml +++ b/.github/workflows/check-colab-notebooks.yml @@ -38,8 +38,8 @@ jobs: - name: Check for differences run: | # Get the diff, filtering out cell ID changes (which are randomly generated) - # Filter out: diff headers, file markers, hunk markers, and "id" lines - MEANINGFUL_DIFF=$(git diff docs/colab_notebooks/ | grep -E '^[+-]' | grep -v '^[+-]{3}' | grep -vE '^[+-]\s*"id": "[0-9a-fA-F]+",?$' || true) + # Filter out: file markers (--- and +++), and "id" lines + MEANINGFUL_DIFF=$(git diff docs/colab_notebooks/ | grep -E '^[+-]' | grep -v '^---' | grep -v '^+++' | grep -vE '^[+-]\s*"id": "[0-9a-fA-F]+",?$' || true) if [ -z "$MEANINGFUL_DIFF" ]; then echo "βœ… Colab notebooks are 
up-to-date (ignoring cell ID changes)" From 8ea7044df08baf2655cb9a1d1f70a565d0b1ed64 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 10:51:35 -0300 Subject: [PATCH 08/14] addressing comments --- docs/colab_notebooks/1-the-basics.ipynb | 68 +++++++++-------- ...ctured-outputs-and-jinja-expressions.ipynb | 64 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 60 ++++++++------- .../4-providing-images-as-context.ipynb | 74 +++++++++---------- docs/scripts/generate_colab_notebooks.py | 15 ++-- 5 files changed, 138 insertions(+), 143 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 356cbc88..3619ba2c 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "4b31b914", + "id": "ec9667da", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,33 +14,31 @@ }, { "cell_type": "markdown", - "id": "591c96b6", + "id": "341d4a48", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to set up the environment for Google Colab.\n" + "Run the cells below to install the dependencies and set up the API key.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "8e03a646", + "id": "79ecb0d0", "metadata": {}, "outputs": [], "source": [ - "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer>=0.1.6" ] }, { "cell_type": "code", "execution_count": null, - "id": "5fa2627e", + "id": "60175007", "metadata": {}, "outputs": [], "source": [ - "# Set up NVIDIA API key from Colab secrets\n", "import os\n", "\n", "from google.colab import userdata\n", @@ -50,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "7142c668", + "id": "58c7a23c", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -61,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4d314046", + 
"id": "6b51048b", "metadata": {}, "outputs": [], "source": [ @@ -82,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "2c2c91a1", + "id": "7743442a", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -95,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c2b97c7d", + "id": "47a2143c", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "385067da", + "id": "23faa8ee", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -121,7 +119,7 @@ { "cell_type": "code", "execution_count": null, - "id": "808602fc", + "id": "3a31a803", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "7779673f", + "id": "5dd6cc4b", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -168,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4a16bb2c", + "id": "2f6daf5b", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "e9cbe947", + "id": "ca28bf79", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -194,7 +192,7 @@ { "cell_type": "code", "execution_count": null, - "id": "899ad203", + "id": "dc2970b7", "metadata": {}, "outputs": [], "source": [ @@ -203,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "b95c6952", + "id": "dbe2ba06", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -212,7 +210,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2c4ea55f", + "id": "b78ee838", "metadata": {}, "outputs": [], "source": [ @@ -293,7 +291,7 @@ }, { "cell_type": "markdown", - "id": "39bae663", + "id": "4aa0d91e", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -302,7 +300,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"0152a21c", + "id": "5297ff97", "metadata": {}, "outputs": [], "source": [ @@ -339,7 +337,7 @@ }, { "cell_type": "markdown", - "id": "f9f2b181", + "id": "39b0ab09", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -354,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd2ef4ee", + "id": "b92eea36", "metadata": {}, "outputs": [], "source": [ @@ -391,7 +389,7 @@ }, { "cell_type": "markdown", - "id": "9c8cf4b4", + "id": "ae82081b", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -408,7 +406,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60083d1b", + "id": "61ad8562", "metadata": {}, "outputs": [], "source": [ @@ -418,7 +416,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d5174e3", + "id": "91d30f43", "metadata": {}, "outputs": [], "source": [ @@ -429,7 +427,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a608cf38", + "id": "86090f7f", "metadata": {}, "outputs": [], "source": [ @@ -439,7 +437,7 @@ }, { "cell_type": "markdown", - "id": "ac022c04", + "id": "bdc88621", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -452,7 +450,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f2a09834", + "id": "ef1f2426", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +460,7 @@ }, { "cell_type": "markdown", - "id": "82515637", + "id": "4e91097e", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -475,7 +473,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cffa3d1a", + "id": "44192aab", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +483,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4695344e", + "id": "6725271d", "metadata": {}, "outputs": [], "source": [ @@ -498,7 +496,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c9cc607", + "id": "491d20af", "metadata": {}, "outputs": [], "source": [ @@ -510,7 +508,7 @@ }, { "cell_type": "markdown", - "id": "e83dac04", + "id": "70f9c440", 
"metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 2e33d5ce..3298615a 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "61350549", + "id": "3f53bedb", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,33 +16,31 @@ }, { "cell_type": "markdown", - "id": "5cdda092", + "id": "4086037e", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to set up the environment for Google Colab.\n" + "Run the cells below to install the dependencies and set up the API key.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "ef62a444", + "id": "b2afd1f7", "metadata": {}, "outputs": [], "source": [ - "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer>=0.1.6" ] }, { "cell_type": "code", "execution_count": null, - "id": "c3af1cf7", + "id": "07e57a6e", "metadata": {}, "outputs": [], "source": [ - "# Set up NVIDIA API key from Colab secrets\n", "import os\n", "\n", "from google.colab import userdata\n", @@ -52,7 +50,7 @@ }, { "cell_type": "markdown", - "id": "989c24bc", + "id": "138c072c", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +61,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d360a00a", + "id": "77acc96a", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +82,7 @@ }, { "cell_type": "markdown", - "id": "77231827", + "id": "d4269003", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0a595ceb", + "id": "d622824f", "metadata": {}, "outputs": [], 
"source": [ @@ -106,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "a719760f", + "id": "ee7e7e22", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f3c008cb", + "id": "fcf88403", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +153,7 @@ }, { "cell_type": "markdown", - "id": "d21ae4eb", + "id": "9f4a36da", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -170,7 +168,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f59b3c3", + "id": "c55b0aac", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "b7ce6ffe", + "id": "6197bced", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -206,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e4a9df9", + "id": "501026dd", "metadata": {}, "outputs": [], "source": [ @@ -234,7 +232,7 @@ }, { "cell_type": "markdown", - "id": "d0f95e86", + "id": "e07235d6", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -243,7 +241,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0eef341c", + "id": "643fca70", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +350,7 @@ }, { "cell_type": "markdown", - "id": "ca232974", + "id": "65af2821", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -369,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cc41ccd4", + "id": "a8669e16", "metadata": {}, "outputs": [], "source": [ @@ -423,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "1cc86025", + "id": "955833fc", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -440,7 +438,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1210f637", + "id": "602ed286", "metadata": {}, "outputs": 
[], "source": [ @@ -450,7 +448,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c1bec249", + "id": "c3e6f74e", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +459,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a00c2f22", + "id": "53f7bcb3", "metadata": {}, "outputs": [], "source": [ @@ -471,7 +469,7 @@ }, { "cell_type": "markdown", - "id": "e26f2b05", + "id": "fddf7bad", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -484,7 +482,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b0c16c2", + "id": "391f45b1", "metadata": {}, "outputs": [], "source": [ @@ -494,7 +492,7 @@ }, { "cell_type": "markdown", - "id": "6029cce2", + "id": "f7dae0a0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -507,7 +505,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ede95f7d", + "id": "9b0677ae", "metadata": {}, "outputs": [], "source": [ @@ -517,7 +515,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db0b85e1", + "id": "1945840f", "metadata": {}, "outputs": [], "source": [ @@ -530,7 +528,7 @@ { "cell_type": "code", "execution_count": null, - "id": "676da66a", + "id": "b3c85e29", "metadata": {}, "outputs": [], "source": [ @@ -542,7 +540,7 @@ }, { "cell_type": "markdown", - "id": "879ce764", + "id": "26a2f7cf", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index beaade14..82ea0f02 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "180d64a4", + "id": "fb66d4f6", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,33 +16,31 @@ }, { "cell_type": "markdown", - "id": "33dc31a2", + "id": "720d6b46", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run 
the cells below to set up the environment for Google Colab.\n" + "Run the cells below to install the dependencies and set up the API key.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "3b223f0e", + "id": "4c24567c", "metadata": {}, "outputs": [], "source": [ - "# Install data-designer and dependencies\n", - "!pip install -q data-designer" + "!pip install -q data-designer>=0.1.6" ] }, { "cell_type": "code", "execution_count": null, - "id": "9a2ab316", + "id": "3f02f461", "metadata": {}, "outputs": [], "source": [ - "# Set up NVIDIA API key from Colab secrets\n", "import os\n", "\n", "from google.colab import userdata\n", @@ -52,7 +50,7 @@ }, { "cell_type": "markdown", - "id": "7f061323", + "id": "4f9d5b8e", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +61,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4be8e0f", + "id": "f8d7c320", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "b89575d7", + "id": "17689e82", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "236a2916", + "id": "54947242", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "4df6e463", + "id": "48bb816e", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53d41281", + "id": "9158afff", "metadata": {}, "outputs": [], "source": [ @@ -149,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "5cd3f03e", + "id": "caa847b8", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -164,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6bca047c", + "id": "e9fbbb41", "metadata": {}, "outputs": [], "source": [ @@ -173,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "875c849c", + "id": "840a96be", 
"metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -198,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4383058a", + "id": "2b08702a", "metadata": {}, "outputs": [], "source": [ @@ -216,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "81b460ba", + "id": "cf3075dc", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -233,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55054448", + "id": "72ef6998", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "48fc0954", + "id": "38837ef0", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -340,7 +338,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eafe69da", + "id": "849987f6", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +348,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78a93876", + "id": "b993fdea", "metadata": {}, "outputs": [], "source": [ @@ -361,7 +359,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1004f0e5", + "id": "6d9fbf4d", "metadata": {}, "outputs": [], "source": [ @@ -371,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "e712bc10", + "id": "9ad17ac1", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -384,7 +382,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f28cf03", + "id": "92487456", "metadata": {}, "outputs": [], "source": [ @@ -394,7 +392,7 @@ }, { "cell_type": "markdown", - "id": "d4511bf3", + "id": "e3219bb8", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -407,7 +405,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a1ea546", + "id": "488e15bf", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ec54d5b7", + "id": "4bc23ee4", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +428,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"d0bfcb9b", + "id": "48ca6bb9", "metadata": {}, "outputs": [], "source": [ @@ -442,7 +440,7 @@ }, { "cell_type": "markdown", - "id": "cebfcbec", + "id": "8bfff6c5", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 48bae7e1..50778787 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "16835351", + "id": "70f9bdbf", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "52a4f774", + "id": "17bf15ef", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,33 +25,31 @@ }, { "cell_type": "markdown", - "id": "7d9d836f", + "id": "47b661ec", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to set up the environment for Google Colab.\n" + "Run the cells below to install the dependencies and set up the API key.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "611577bd", + "id": "caac58a1", "metadata": {}, "outputs": [], "source": [ - "# Install data-designer and dependencies\n", - "!pip install -q data-designer pillow" + "!pip install -q data-designer>=0.1.6 pillow>=12.0.0" ] }, { "cell_type": "code", "execution_count": null, - "id": "8f9cd95f", + "id": "af9bb850", "metadata": {}, "outputs": [], "source": [ - "# Set up NVIDIA API key from Colab secrets\n", "import os\n", "\n", "from google.colab import userdata\n", @@ -61,7 +59,7 @@ }, { "cell_type": "markdown", - "id": "1ed0cc0d", + "id": "52062953", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -72,7 +70,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d08fd71", + "id": "5406d2f8", "metadata": {}, "outputs": [], "source": [ @@ -103,7 
+101,7 @@ }, { "cell_type": "markdown", - "id": "a210d017", + "id": "18d9e8a8", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -116,7 +114,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c3eb41f3", + "id": "f7ea1cfd", "metadata": {}, "outputs": [], "source": [ @@ -125,7 +123,7 @@ }, { "cell_type": "markdown", - "id": "2a279c56", + "id": "498dbf37", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -142,7 +140,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2c93ec09", + "id": "d02da766", "metadata": {}, "outputs": [], "source": [ @@ -165,7 +163,7 @@ }, { "cell_type": "markdown", - "id": "4e2b4500", + "id": "652e35c4", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -180,7 +178,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7dc93d8", + "id": "9105d13a", "metadata": {}, "outputs": [], "source": [ @@ -189,7 +187,7 @@ }, { "cell_type": "markdown", - "id": "1a914dc3", + "id": "2f2e0c3f", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -206,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dacf9004", + "id": "0868eefd", "metadata": {}, "outputs": [], "source": [ @@ -221,7 +219,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e602b972", + "id": "017fd352", "metadata": {}, "outputs": [], "source": [ @@ -269,7 +267,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7ad701fd", + "id": "82c3e194", "metadata": {}, "outputs": [], "source": [ @@ -287,7 +285,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c6ade03b", + "id": "6ccb2131", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +295,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5dd4c526", + "id": "2bee55a3", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +309,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6fd54792", + "id": "38535fad", "metadata": { 
"lines_to_next_cell": 2 }, @@ -340,7 +338,7 @@ }, { "cell_type": "markdown", - "id": "69967b8d", + "id": "1dfeef0f", "metadata": { "lines_to_next_cell": 2 }, @@ -348,7 +346,7 @@ }, { "cell_type": "markdown", - "id": "827d7a34", + "id": "540c1560", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -365,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27b9d461", + "id": "c203e707", "metadata": {}, "outputs": [], "source": [ @@ -375,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f56d6c41", + "id": "477a82bf", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +384,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d828b41d", + "id": "4add7bae", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +394,7 @@ }, { "cell_type": "markdown", - "id": "60bf5e50", + "id": "2add3b7e", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -409,7 +407,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fea6e43d", + "id": "f9420b94", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +417,7 @@ }, { "cell_type": "markdown", - "id": "83fa6b55", + "id": "2561626c", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -430,7 +428,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6fa6945d", + "id": "da172f7a", "metadata": { "lines_to_next_cell": 2 }, @@ -454,7 +452,7 @@ }, { "cell_type": "markdown", - "id": "96257f6d", + "id": "d55e0dcf", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -467,7 +465,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a4308cc9", + "id": "3ebc2769", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a0747c1", + "id": "cc034afb", "metadata": {}, "outputs": [], "source": [ @@ -490,7 +488,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6cfe8cad", + "id": "b769554e", "metadata": {}, "outputs": [], "source": [ @@ -502,7 
+500,7 @@ }, { "cell_type": "markdown", - "id": "2ab3bedd", + "id": "9d19e830", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index f5180849..01fc65f1 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -17,24 +17,24 @@ from nbformat import NotebookNode from nbformat.v4 import new_code_cell, new_markdown_cell +from data_designer import __version__ as data_designer_version + IMPORT_SECTION_MARKER = "### πŸ“¦ Import the essentials" COLAB_SETUP_MARKDOWN = """\ ### ⚑ Colab Setup -Run the cells below to set up the environment for Google Colab. +Run the cells below to install the dependencies and set up the API key. """ ADDITIONAL_DEPENDENCIES = { - "4-providing-images-as-context.py": " pillow", + "4-providing-images-as-context.py": " pillow>=12.0.0", } COLAB_INSTALL_CELL = """\ -# Install data-designer and dependencies -!pip install -q data-designer{}""" +!pip install -q data-designer>={version}{deps}""" COLAB_API_KEY_CELL = """\ -# Set up NVIDIA API key from Colab secrets import os from google.colab import userdata @@ -46,7 +46,10 @@ def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode] """Create the Colab-specific setup cells to inject before imports.""" return [ new_markdown_cell(source=COLAB_SETUP_MARKDOWN), - new_code_cell(source=COLAB_INSTALL_CELL.format(additional_dependencies)), + new_code_cell(source=COLAB_INSTALL_CELL.format( + version=".".join(data_designer_version.split(".")[:3]), + deps=additional_dependencies, + )), new_code_cell(source=COLAB_API_KEY_CELL), ] From 26b36f9ee336366eb4c14bcc8b90e16757870aec Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 11:06:48 -0300 Subject: [PATCH 09/14] fixes, getpass --- docs/colab_notebooks/1-the-basics.ipynb | 72 +++++++++-------- ...ctured-outputs-and-jinja-expressions.ipynb | 68 ++++++++-------- 
.../3-seeding-with-a-dataset.ipynb | 64 ++++++++------- .../4-providing-images-as-context.ipynb | 78 ++++++++++--------- docs/scripts/generate_colab_notebooks.py | 22 ++++-- 5 files changed, 165 insertions(+), 139 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 3619ba2c..c096fae0 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ec9667da", + "id": "e5e6a125", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,41 +14,45 @@ }, { "cell_type": "markdown", - "id": "341d4a48", + "id": "3624a628", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to install the dependencies and set up the API key.\n" + "Run the cells below to install the dependencies and set up the API key. If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "79ecb0d0", + "id": "06637f1c", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.6" + "!pip install -q data-designer>=0.1.5" ] }, { "cell_type": "code", "execution_count": null, - "id": "60175007", + "id": "b22aaac1", "metadata": {}, "outputs": [], "source": [ + "import getpass\n", "import os\n", "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "try:\n", + " os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n", + "except userdata.SecretNotFoundError:\n", + " os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Enter your NVIDIA API key: \")" ] }, { "cell_type": "markdown", - "id": "58c7a23c", + "id": "3480b8ce", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -59,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b51048b", + "id": "57b88371", "metadata": 
{}, "outputs": [], "source": [ @@ -80,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "7743442a", + "id": "e395b791", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -93,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47a2143c", + "id": "074bbb9d", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "23faa8ee", + "id": "0991c1fb", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -119,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a31a803", + "id": "f9611ea0", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "5dd6cc4b", + "id": "c7cd421b", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -166,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2f6daf5b", + "id": "2896ca4d", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "ca28bf79", + "id": "e2c417a1", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -192,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dc2970b7", + "id": "e7a7fbc3", "metadata": {}, "outputs": [], "source": [ @@ -201,7 +205,7 @@ }, { "cell_type": "markdown", - "id": "dbe2ba06", + "id": "4d272b48", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -210,7 +214,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b78ee838", + "id": "a7884ce5", "metadata": {}, "outputs": [], "source": [ @@ -291,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "4aa0d91e", + "id": "b0c6c331", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -300,7 +304,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5297ff97", + "id": "c05ef860", 
"metadata": {}, "outputs": [], "source": [ @@ -337,7 +341,7 @@ }, { "cell_type": "markdown", - "id": "39b0ab09", + "id": "47ad6af8", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -352,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b92eea36", + "id": "3a2d9680", "metadata": {}, "outputs": [], "source": [ @@ -389,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "ae82081b", + "id": "d7f99a8f", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -406,7 +410,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61ad8562", + "id": "e5637e52", "metadata": {}, "outputs": [], "source": [ @@ -416,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "91d30f43", + "id": "50f9c5ca", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "86090f7f", + "id": "20c293e8", "metadata": {}, "outputs": [], "source": [ @@ -437,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "bdc88621", + "id": "185f254b", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -450,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef1f2426", + "id": "2e5e82df", "metadata": {}, "outputs": [], "source": [ @@ -460,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "4e91097e", + "id": "5c685e44", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -473,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44192aab", + "id": "7a96c763", "metadata": {}, "outputs": [], "source": [ @@ -483,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6725271d", + "id": "0aef9021", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "491d20af", + "id": "39a7932a", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "70f9c440", + "id": "f329ebf9", "metadata": {}, "source": [ "## ⏭️ 
Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 3298615a..ebfa43e1 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3f53bedb", + "id": "1429ff3c", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,41 +16,45 @@ }, { "cell_type": "markdown", - "id": "4086037e", + "id": "afa0eed7", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to install the dependencies and set up the API key.\n" + "Run the cells below to install the dependencies and set up the API key. If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "b2afd1f7", + "id": "f11db2d2", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.6" + "!pip install -q data-designer>=0.1.5" ] }, { "cell_type": "code", "execution_count": null, - "id": "07e57a6e", + "id": "e37da6da", "metadata": {}, "outputs": [], "source": [ + "import getpass\n", "import os\n", "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "try:\n", + " os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n", + "except userdata.SecretNotFoundError:\n", + " os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Enter your NVIDIA API key: \")" ] }, { "cell_type": "markdown", - "id": "138c072c", + "id": "7b870a00", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -61,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77acc96a", + "id": "d7f3d55b", "metadata": {}, "outputs": [], "source": [ @@ -82,7 +86,7 @@ }, { 
"cell_type": "markdown", - "id": "d4269003", + "id": "9ec98c2a", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -95,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d622824f", + "id": "3374c9ca", "metadata": {}, "outputs": [], "source": [ @@ -104,7 +108,7 @@ }, { "cell_type": "markdown", - "id": "ee7e7e22", + "id": "a7ba9915", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -121,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fcf88403", + "id": "97174b12", "metadata": {}, "outputs": [], "source": [ @@ -153,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "9f4a36da", + "id": "146a50e6", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -168,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c55b0aac", + "id": "bb746e7d", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +181,7 @@ }, { "cell_type": "markdown", - "id": "6197bced", + "id": "b9f8dd8b", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -204,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "501026dd", + "id": "9d20677d", "metadata": {}, "outputs": [], "source": [ @@ -232,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "e07235d6", + "id": "dc580c93", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -241,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "643fca70", + "id": "677921e9", "metadata": {}, "outputs": [], "source": [ @@ -350,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "65af2821", + "id": "d9ea22bf", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -367,7 +371,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8669e16", + "id": "abfa418a", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +425,7 @@ }, { 
"cell_type": "markdown", - "id": "955833fc", + "id": "0129dd86", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -438,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "602ed286", + "id": "7adee26e", "metadata": {}, "outputs": [], "source": [ @@ -448,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c3e6f74e", + "id": "36fa3c62", "metadata": {}, "outputs": [], "source": [ @@ -459,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53f7bcb3", + "id": "67bc0a68", "metadata": {}, "outputs": [], "source": [ @@ -469,7 +473,7 @@ }, { "cell_type": "markdown", - "id": "fddf7bad", + "id": "4dc3c12c", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -482,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "391f45b1", + "id": "31717c45", "metadata": {}, "outputs": [], "source": [ @@ -492,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "f7dae0a0", + "id": "27911ce8", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -505,7 +509,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b0677ae", + "id": "7c2167fc", "metadata": {}, "outputs": [], "source": [ @@ -515,7 +519,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1945840f", + "id": "5f170797", "metadata": {}, "outputs": [], "source": [ @@ -528,7 +532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3c85e29", + "id": "cdc8f67b", "metadata": {}, "outputs": [], "source": [ @@ -540,7 +544,7 @@ }, { "cell_type": "markdown", - "id": "26a2f7cf", + "id": "0c1a4900", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 82ea0f02..db560090 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "fb66d4f6", + "id": 
"40de3ce2", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,41 +16,45 @@ }, { "cell_type": "markdown", - "id": "720d6b46", + "id": "ade9e4d1", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to install the dependencies and set up the API key.\n" + "Run the cells below to install the dependencies and set up the API key. If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "4c24567c", + "id": "001db352", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.6" + "!pip install -q data-designer>=0.1.5" ] }, { "cell_type": "code", "execution_count": null, - "id": "3f02f461", + "id": "68c77018", "metadata": {}, "outputs": [], "source": [ + "import getpass\n", "import os\n", "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "try:\n", + " os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n", + "except userdata.SecretNotFoundError:\n", + " os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Enter your NVIDIA API key: \")" ] }, { "cell_type": "markdown", - "id": "4f9d5b8e", + "id": "d7c03ce9", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -61,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f8d7c320", + "id": "049baf76", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "17689e82", + "id": "98e7a674", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54947242", + "id": "d3817dd2", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "48bb816e", + "id": "da5488bb", "metadata": {}, "source": [ "### 
πŸŽ›οΈ Define model configurations\n", @@ -115,7 +119,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9158afff", + "id": "51c2675f", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "caa847b8", + "id": "c0c7a10e", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e9fbbb41", + "id": "a8aa8ab1", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "840a96be", + "id": "76a3082a", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b08702a", + "id": "be917d7a", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +218,7 @@ }, { "cell_type": "markdown", - "id": "cf3075dc", + "id": "9aef6903", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -231,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "72ef6998", + "id": "a3b2598a", "metadata": {}, "outputs": [], "source": [ @@ -321,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "38837ef0", + "id": "b9410f6a", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -338,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "849987f6", + "id": "5ed93859", "metadata": {}, "outputs": [], "source": [ @@ -348,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b993fdea", + "id": "702e1f9b", "metadata": {}, "outputs": [], "source": [ @@ -359,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d9fbf4d", + "id": "eda5506a", "metadata": {}, "outputs": [], "source": [ @@ -369,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "9ad17ac1", + "id": "bb355499", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -382,7 +386,7 @@ { "cell_type": "code", "execution_count": 
null, - "id": "92487456", + "id": "71835cd6", "metadata": {}, "outputs": [], "source": [ @@ -392,7 +396,7 @@ }, { "cell_type": "markdown", - "id": "e3219bb8", + "id": "e3f1f1b0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -405,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "488e15bf", + "id": "c45c2641", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4bc23ee4", + "id": "aa817871", "metadata": {}, "outputs": [], "source": [ @@ -428,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48ca6bb9", + "id": "0e96493b", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "8bfff6c5", + "id": "a083ed48", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 50778787..4fe977fb 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "70f9bdbf", + "id": "809f5cfb", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "17bf15ef", + "id": "bcbeea27", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,41 +25,45 @@ }, { "cell_type": "markdown", - "id": "47b661ec", + "id": "8cba0a51", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", "\n", - "Run the cells below to install the dependencies and set up the API key.\n" + "Run the cells below to install the dependencies and set up the API key. 
If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "caac58a1", + "id": "d2b68d0b", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.6 pillow>=12.0.0" + "!pip install -q data-designer>=0.1.5 pillow>=12.0.0" ] }, { "cell_type": "code", "execution_count": null, - "id": "af9bb850", + "id": "d9ffdf48", "metadata": {}, "outputs": [], "source": [ + "import getpass\n", "import os\n", "\n", "from google.colab import userdata\n", "\n", - "os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")" + "try:\n", + " os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n", + "except userdata.SecretNotFoundError:\n", + " os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Enter your NVIDIA API key: \")" ] }, { "cell_type": "markdown", - "id": "52062953", + "id": "6673f9ea", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -70,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5406d2f8", + "id": "10031076", "metadata": {}, "outputs": [], "source": [ @@ -101,7 +105,7 @@ }, { "cell_type": "markdown", - "id": "18d9e8a8", + "id": "aa2128d0", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -114,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7ea1cfd", + "id": "cd925eca", "metadata": {}, "outputs": [], "source": [ @@ -123,7 +127,7 @@ }, { "cell_type": "markdown", - "id": "498dbf37", + "id": "e9edeee0", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -140,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d02da766", + "id": "26e59d13", "metadata": {}, "outputs": [], "source": [ @@ -163,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "652e35c4", + "id": "8c24afd7", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -178,7 +182,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "9105d13a", + "id": "555eb2f3", "metadata": {}, "outputs": [], "source": [ @@ -187,7 +191,7 @@ }, { "cell_type": "markdown", - "id": "2f2e0c3f", + "id": "7a87a2ec", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -204,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0868eefd", + "id": "d95a5705", "metadata": {}, "outputs": [], "source": [ @@ -219,7 +223,7 @@ { "cell_type": "code", "execution_count": null, - "id": "017fd352", + "id": "de2624c1", "metadata": {}, "outputs": [], "source": [ @@ -267,7 +271,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82c3e194", + "id": "79886f68", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +289,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ccb2131", + "id": "ef8b5065", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2bee55a3", + "id": "97d977af", "metadata": {}, "outputs": [], "source": [ @@ -309,7 +313,7 @@ { "cell_type": "code", "execution_count": null, - "id": "38535fad", + "id": "d8338041", "metadata": { "lines_to_next_cell": 2 }, @@ -338,7 +342,7 @@ }, { "cell_type": "markdown", - "id": "1dfeef0f", + "id": "14cc596c", "metadata": { "lines_to_next_cell": 2 }, @@ -346,7 +350,7 @@ }, { "cell_type": "markdown", - "id": "540c1560", + "id": "7ce24cb6", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -363,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c203e707", + "id": "05697bc3", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "477a82bf", + "id": "48d9675a", "metadata": {}, "outputs": [], "source": [ @@ -384,7 +388,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4add7bae", + "id": "3f8d9f72", "metadata": {}, "outputs": [], "source": [ @@ -394,7 +398,7 @@ }, { "cell_type": "markdown", - "id": 
"2add3b7e", + "id": "b4df79e4", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -407,7 +411,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9420b94", + "id": "4d52876b", "metadata": {}, "outputs": [], "source": [ @@ -417,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "2561626c", + "id": "4b65245e", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -428,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da172f7a", + "id": "8853ae73", "metadata": { "lines_to_next_cell": 2 }, @@ -452,7 +456,7 @@ }, { "cell_type": "markdown", - "id": "d55e0dcf", + "id": "843e982e", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -465,7 +469,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3ebc2769", + "id": "9684a964", "metadata": {}, "outputs": [], "source": [ @@ -475,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cc034afb", + "id": "4977d299", "metadata": {}, "outputs": [], "source": [ @@ -488,7 +492,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b769554e", + "id": "4b13cfdd", "metadata": {}, "outputs": [], "source": [ @@ -500,7 +504,7 @@ }, { "cell_type": "markdown", - "id": "9d19e830", + "id": "11d0f65a", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index 01fc65f1..fab89c34 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -24,7 +24,7 @@ COLAB_SETUP_MARKDOWN = """\ ### ⚑ Colab Setup -Run the cells below to install the dependencies and set up the API key. +Run the cells below to install the dependencies and set up the API key. If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com). 
""" ADDITIONAL_DEPENDENCIES = { @@ -35,21 +35,31 @@ !pip install -q data-designer>={version}{deps}""" COLAB_API_KEY_CELL = """\ +import getpass import os from google.colab import userdata -os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY")""" +try: + os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY") +except userdata.SecretNotFoundError: + os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter your NVIDIA API key: ")""" def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode]: """Create the Colab-specific setup cells to inject before imports.""" + dd_version_parts = [int(part) for part in data_designer_version.split(".")[:3]] + dd_version_parts[-1] -= 1 # current version is dev and already incremented + current_dd_version = ".".join(str(part) for part in dd_version_parts) + return [ new_markdown_cell(source=COLAB_SETUP_MARKDOWN), - new_code_cell(source=COLAB_INSTALL_CELL.format( - version=".".join(data_designer_version.split(".")[:3]), - deps=additional_dependencies, - )), + new_code_cell( + source=COLAB_INSTALL_CELL.format( + version=current_dd_version, + deps=additional_dependencies, + ) + ), new_code_cell(source=COLAB_API_KEY_CELL), ] From 18c1e99f3b3282af262b877ac62c842866fcbd62 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 11:19:56 -0300 Subject: [PATCH 10/14] trying again --- .github/workflows/check-colab-notebooks.yml | 2 + docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++-------- .../4-providing-images-as-context.ipynb | 68 +++++++++---------- 5 files changed, 123 insertions(+), 121 deletions(-) diff --git a/.github/workflows/check-colab-notebooks.yml b/.github/workflows/check-colab-notebooks.yml index 7116b4a0..13dab855 100644 --- a/.github/workflows/check-colab-notebooks.yml +++ b/.github/workflows/check-colab-notebooks.yml @@ -19,6 +19,8 @@ jobs: steps: - name: 
Checkout code uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v5 diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index c096fae0..7679af06 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e5e6a125", + "id": "6dd7e91b", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "3624a628", + "id": "b0fcabe0", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,7 +25,7 @@ { "cell_type": "code", "execution_count": null, - "id": "06637f1c", + "id": "26150284", "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b22aaac1", + "id": "afc3fb2c", "metadata": {}, "outputs": [], "source": [ @@ -52,7 +52,7 @@ }, { "cell_type": "markdown", - "id": "3480b8ce", + "id": "a2e17bd2", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57b88371", + "id": "346eec1f", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "e395b791", + "id": "a95a5414", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "074bbb9d", + "id": "7971d997", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "0991c1fb", + "id": "75ac1115", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9611ea0", + "id": "3cfea0c7", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "c7cd421b", + "id": "a9808126", "metadata": {}, "source": [ "### πŸ—οΈ 
Initialize the Data Designer Config Builder\n", @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2896ca4d", + "id": "5ea9ff73", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "e2c417a1", + "id": "13148fd5", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e7a7fbc3", + "id": "7e054714", "metadata": {}, "outputs": [], "source": [ @@ -205,7 +205,7 @@ }, { "cell_type": "markdown", - "id": "4d272b48", + "id": "da4a52dd", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -214,7 +214,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7884ce5", + "id": "9b8bccaf", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "b0c6c331", + "id": "12d1db15", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -304,7 +304,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c05ef860", + "id": "fe34e957", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +341,7 @@ }, { "cell_type": "markdown", - "id": "47ad6af8", + "id": "767cb1fc", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a2d9680", + "id": "9d1e1f93", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "d7f99a8f", + "id": "ee0ace4e", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -410,7 +410,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5637e52", + "id": "810a6925", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50f9c5ca", + "id": "e0631940", "metadata": {}, 
"outputs": [], "source": [ @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20c293e8", + "id": "d30d6b1b", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "185f254b", + "id": "e6c6295b", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2e5e82df", + "id": "2bc57d1c", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "5c685e44", + "id": "86d7528e", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7a96c763", + "id": "c06c5c3f", "metadata": {}, "outputs": [], "source": [ @@ -487,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0aef9021", + "id": "5abbb3e7", "metadata": {}, "outputs": [], "source": [ @@ -500,7 +500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39a7932a", + "id": "5c7e5706", "metadata": {}, "outputs": [], "source": [ @@ -512,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "f329ebf9", + "id": "f596f54b", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index ebfa43e1..5f35ec53 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1429ff3c", + "id": "fc655fb9", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "afa0eed7", + "id": "3feb4e26", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f11db2d2", + "id": 
"d14644d9", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e37da6da", + "id": "7a4e29fb", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "7b870a00", + "id": "1b9d6351", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7f3d55b", + "id": "1ad282cb", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "9ec98c2a", + "id": "dc50c7d5", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3374c9ca", + "id": "d6b39808", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ }, { "cell_type": "markdown", - "id": "a7ba9915", + "id": "f9b53c6b", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "97174b12", + "id": "ff4fceed", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "146a50e6", + "id": "42105d28", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bb746e7d", + "id": "a2ad2186", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ }, { "cell_type": "markdown", - "id": "b9f8dd8b", + "id": "080789dd", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d20677d", + "id": "214773d1", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "dc580c93", + "id": "7eecc55a", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous 
notebook.\n" @@ -245,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "677921e9", + "id": "ae18dc2b", "metadata": {}, "outputs": [], "source": [ @@ -354,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "d9ea22bf", + "id": "d4273457", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -371,7 +371,7 @@ { "cell_type": "code", "execution_count": null, - "id": "abfa418a", + "id": "4999ecca", "metadata": {}, "outputs": [], "source": [ @@ -425,7 +425,7 @@ }, { "cell_type": "markdown", - "id": "0129dd86", + "id": "b3d53148", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7adee26e", + "id": "efd6a266", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36fa3c62", + "id": "b580e182", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67bc0a68", + "id": "ea98c3df", "metadata": {}, "outputs": [], "source": [ @@ -473,7 +473,7 @@ }, { "cell_type": "markdown", - "id": "4dc3c12c", + "id": "b1f43d6a", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -486,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31717c45", + "id": "c4127be9", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "27911ce8", + "id": "38d649ab", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -509,7 +509,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c2167fc", + "id": "3f0d0695", "metadata": {}, "outputs": [], "source": [ @@ -519,7 +519,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f170797", + "id": "bca22085", "metadata": {}, "outputs": [], "source": [ @@ -532,7 +532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cdc8f67b", + "id": "d7b26139", 
"metadata": {}, "outputs": [], "source": [ @@ -544,7 +544,7 @@ }, { "cell_type": "markdown", - "id": "0c1a4900", + "id": "c3d0d966", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index db560090..eddaffbe 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "40de3ce2", + "id": "5c8fc975", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "ade9e4d1", + "id": "55ee6b2d", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": null, - "id": "001db352", + "id": "dec57db5", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68c77018", + "id": "2ef75edf", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "d7c03ce9", + "id": "df6262d6", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "049baf76", + "id": "f0ce1068", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "98e7a674", + "id": "4846a735", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -93,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d3817dd2", + "id": "883f9b05", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "da5488bb", + "id": "3762ea9d", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -119,7 +119,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51c2675f", + "id": "57253d47", 
"metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "c0c7a10e", + "id": "ffad7af6", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8aa8ab1", + "id": "0814365d", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "76a3082a", + "id": "5c7743a9", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -200,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be917d7a", + "id": "f2f6fa9f", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ }, { "cell_type": "markdown", - "id": "9aef6903", + "id": "7b0f5774", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a3b2598a", + "id": "bb3d141c", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "b9410f6a", + "id": "78541a03", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ed93859", + "id": "a93ddfbb", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "702e1f9b", + "id": "d0fdeb63", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eda5506a", + "id": "9ed717b6", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "bb355499", + "id": "0d8757b7", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71835cd6", + "id": "eea399db", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +396,7 @@ }, { "cell_type": "markdown", - "id": 
"e3f1f1b0", + "id": "e68d8cc7", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c45c2641", + "id": "e1956a82", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa817871", + "id": "c949473c", "metadata": {}, "outputs": [], "source": [ @@ -432,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0e96493b", + "id": "f9e759c4", "metadata": {}, "outputs": [], "source": [ @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "a083ed48", + "id": "d705f411", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 4fe977fb..34a34f66 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "809f5cfb", + "id": "75adfdca", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "bcbeea27", + "id": "b1a35fc7", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "8cba0a51", + "id": "066943db", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d2b68d0b", + "id": "08d33f4d", "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d9ffdf48", + "id": "ae5b19c9", "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "markdown", - "id": "6673f9ea", + "id": "db89c9ed", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -74,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"10031076", + "id": "66985142", "metadata": {}, "outputs": [], "source": [ @@ -105,7 +105,7 @@ }, { "cell_type": "markdown", - "id": "aa2128d0", + "id": "59fc1895", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd925eca", + "id": "59d34a45", "metadata": {}, "outputs": [], "source": [ @@ -127,7 +127,7 @@ }, { "cell_type": "markdown", - "id": "e9edeee0", + "id": "60ea632e", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -144,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26e59d13", + "id": "5f1e4145", "metadata": {}, "outputs": [], "source": [ @@ -167,7 +167,7 @@ }, { "cell_type": "markdown", - "id": "8c24afd7", + "id": "172662a0", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -182,7 +182,7 @@ { "cell_type": "code", "execution_count": null, - "id": "555eb2f3", + "id": "0cad1986", "metadata": {}, "outputs": [], "source": [ @@ -191,7 +191,7 @@ }, { "cell_type": "markdown", - "id": "7a87a2ec", + "id": "24de8948", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d95a5705", + "id": "7c7c2ba0", "metadata": {}, "outputs": [], "source": [ @@ -223,7 +223,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de2624c1", + "id": "ce431f01", "metadata": {}, "outputs": [], "source": [ @@ -271,7 +271,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79886f68", + "id": "41a18a82", "metadata": {}, "outputs": [], "source": [ @@ -289,7 +289,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef8b5065", + "id": "56e7ed9c", "metadata": {}, "outputs": [], "source": [ @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "97d977af", + "id": "2c59dd7e", "metadata": {}, "outputs": [], "source": [ @@ -313,7 +313,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "d8338041", + "id": "92c6dc75", "metadata": { "lines_to_next_cell": 2 }, @@ -342,7 +342,7 @@ }, { "cell_type": "markdown", - "id": "14cc596c", + "id": "d83a7369", "metadata": { "lines_to_next_cell": 2 }, @@ -350,7 +350,7 @@ }, { "cell_type": "markdown", - "id": "7ce24cb6", + "id": "cf6cf8ca", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -367,7 +367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05697bc3", + "id": "e62d3c8d", "metadata": {}, "outputs": [], "source": [ @@ -377,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48d9675a", + "id": "80485f13", "metadata": {}, "outputs": [], "source": [ @@ -388,7 +388,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3f8d9f72", + "id": "207a545a", "metadata": {}, "outputs": [], "source": [ @@ -398,7 +398,7 @@ }, { "cell_type": "markdown", - "id": "b4df79e4", + "id": "d873ef3f", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -411,7 +411,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4d52876b", + "id": "424d98c0", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +421,7 @@ }, { "cell_type": "markdown", - "id": "4b65245e", + "id": "dcdb8197", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -432,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8853ae73", + "id": "93f84a52", "metadata": { "lines_to_next_cell": 2 }, @@ -456,7 +456,7 @@ }, { "cell_type": "markdown", - "id": "843e982e", + "id": "da7ce8d2", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -469,7 +469,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9684a964", + "id": "c71f77a7", "metadata": {}, "outputs": [], "source": [ @@ -479,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4977d299", + "id": "282da6b7", "metadata": {}, "outputs": [], "source": [ @@ -492,7 +492,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"4b13cfdd", + "id": "09214b2d", "metadata": {}, "outputs": [], "source": [ @@ -504,7 +504,7 @@ }, { "cell_type": "markdown", - "id": "11d0f65a", + "id": "a2ea6c50", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", From eb35f5fcd4cc0e24a95e006b79637c2b570c042e Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 13:37:41 -0300 Subject: [PATCH 11/14] button only for latest version --- .github/workflows/check-colab-notebooks.yml | 2 - docs/colab_notebooks/1-the-basics.ipynb | 64 +++++++-------- ...ctured-outputs-and-jinja-expressions.ipynb | 60 +++++++------- .../3-seeding-with-a-dataset.ipynb | 56 ++++++------- .../4-providing-images-as-context.ipynb | 80 +++++++++++-------- docs/overrides/main.html | 18 ++++- docs/scripts/generate_colab_notebooks.py | 30 +++---- 7 files changed, 161 insertions(+), 149 deletions(-) diff --git a/.github/workflows/check-colab-notebooks.yml b/.github/workflows/check-colab-notebooks.yml index 13dab855..7116b4a0 100644 --- a/.github/workflows/check-colab-notebooks.yml +++ b/.github/workflows/check-colab-notebooks.yml @@ -19,8 +19,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v5 diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 7679af06..f897396b 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "6dd7e91b", + "id": "08652297", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "b0fcabe0", + "id": "56453533", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,17 +25,17 @@ { "cell_type": "code", "execution_count": null, - "id": "26150284", + "id": "1bebd037", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.5" + "!pip install -q data-designer" ] }, { "cell_type": 
"code", "execution_count": null, - "id": "afc3fb2c", + "id": "4a80501c", "metadata": {}, "outputs": [], "source": [ @@ -52,7 +52,7 @@ }, { "cell_type": "markdown", - "id": "a2e17bd2", + "id": "e1197e2f", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "346eec1f", + "id": "c84d7f47", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "a95a5414", + "id": "4f7cd0e0", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7971d997", + "id": "7d2a3a60", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "75ac1115", + "id": "4239c705", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3cfea0c7", + "id": "dbba56d6", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "a9808126", + "id": "b1d7bf14", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ea9ff73", + "id": "4523896b", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "13148fd5", + "id": "9f808014", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e054714", + "id": "32c3b1f7", "metadata": {}, "outputs": [], "source": [ @@ -205,7 +205,7 @@ }, { "cell_type": "markdown", - "id": "da4a52dd", + "id": "3ebd008d", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -214,7 +214,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"9b8bccaf", + "id": "70aafa95", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "12d1db15", + "id": "e580194b", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -304,7 +304,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe34e957", + "id": "64e17656", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +341,7 @@ }, { "cell_type": "markdown", - "id": "767cb1fc", + "id": "b308d402", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d1e1f93", + "id": "90e9079e", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "ee0ace4e", + "id": "bdc0bb9d", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -410,7 +410,7 @@ { "cell_type": "code", "execution_count": null, - "id": "810a6925", + "id": "7796b12c", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0631940", + "id": "f4b1f7ce", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d30d6b1b", + "id": "8c4b13b2", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "e6c6295b", + "id": "d948303e", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2bc57d1c", + "id": "e431bd3f", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "86d7528e", + "id": "8a0eff6e", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c06c5c3f", + "id": "fe4297f6", "metadata": {}, "outputs": [], "source": [ @@ -487,7 +487,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "5abbb3e7", + "id": "acf9c8b0", "metadata": {}, "outputs": [], "source": [ @@ -500,7 +500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c7e5706", + "id": "2544e10b", "metadata": {}, "outputs": [], "source": [ @@ -512,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "f596f54b", + "id": "d3defc17", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 5f35ec53..6f00e3d4 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "fc655fb9", + "id": "b21c4a12", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "3feb4e26", + "id": "c4584bdd", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,17 +27,17 @@ { "cell_type": "code", "execution_count": null, - "id": "d14644d9", + "id": "ec958990", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.5" + "!pip install -q data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "7a4e29fb", + "id": "a6bd19ba", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "1b9d6351", + "id": "1c59c24a", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1ad282cb", + "id": "3498a0f3", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "dc50c7d5", + "id": "ff6a567e", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"d6b39808", + "id": "795c6336", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ }, { "cell_type": "markdown", - "id": "f9b53c6b", + "id": "f91d0f1e", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff4fceed", + "id": "efd7c908", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "42105d28", + "id": "c72f57d3", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2ad2186", + "id": "eecdbd48", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ }, { "cell_type": "markdown", - "id": "080789dd", + "id": "5d839066", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "214773d1", + "id": "92385cbd", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "7eecc55a", + "id": "ca8768b3", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -245,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ae18dc2b", + "id": "fe04a2ff", "metadata": {}, "outputs": [], "source": [ @@ -354,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "d4273457", + "id": "705b8f82", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -371,7 +371,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4999ecca", + "id": "178b4b52", "metadata": {}, "outputs": [], "source": [ @@ -425,7 +425,7 @@ }, { "cell_type": "markdown", - "id": "b3d53148", + "id": "52429447", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "efd6a266", + "id": "a4e0c81d", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b580e182", + "id": "efd81057", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea98c3df", + "id": "d7217232", "metadata": {}, "outputs": [], "source": [ @@ -473,7 +473,7 @@ }, { "cell_type": "markdown", - "id": "b1f43d6a", + "id": "0c3c16ce", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -486,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4127be9", + "id": "9f7c417e", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "38d649ab", + "id": "8b3aafb9", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -509,7 +509,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3f0d0695", + "id": "b8092031", "metadata": {}, "outputs": [], "source": [ @@ -519,7 +519,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bca22085", + "id": "3c5b7f83", "metadata": {}, "outputs": [], "source": [ @@ -532,7 +532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7b26139", + "id": "0d566d53", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +544,7 @@ }, { "cell_type": "markdown", - "id": "c3d0d966", + "id": "fd5c1377", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index eddaffbe..1f012250 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "5c8fc975", + "id": "f78c3264", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "55ee6b2d", + "id": "becb0165", 
"metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,17 +27,17 @@ { "cell_type": "code", "execution_count": null, - "id": "dec57db5", + "id": "a317a0a6", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.5" + "!pip install -q data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "2ef75edf", + "id": "82122fb6", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "df6262d6", + "id": "9f918bb8", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0ce1068", + "id": "8b2ac5d4", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "4846a735", + "id": "f6bdb412", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -93,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "883f9b05", + "id": "d8e966ae", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "3762ea9d", + "id": "b14c27a7", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -119,7 +119,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57253d47", + "id": "d30c9613", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "ffad7af6", + "id": "80234300", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0814365d", + "id": "3bd9867b", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "5c7743a9", + "id": "e3e49033", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -200,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f2f6fa9f", + "id": "d5efc62c", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 
@@ }, { "cell_type": "markdown", - "id": "7b0f5774", + "id": "a24b6158", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bb3d141c", + "id": "28582202", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "78541a03", + "id": "36966929", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a93ddfbb", + "id": "6927b4c6", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d0fdeb63", + "id": "25a5a9a5", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ed717b6", + "id": "40148f78", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "0d8757b7", + "id": "c6922c71", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eea399db", + "id": "91ecf297", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +396,7 @@ }, { "cell_type": "markdown", - "id": "e68d8cc7", + "id": "0f2d6ebb", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e1956a82", + "id": "1a6ac78a", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c949473c", + "id": "3ab98c20", "metadata": {}, "outputs": [], "source": [ @@ -432,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9e759c4", + "id": "34c97828", "metadata": {}, "outputs": [], "source": [ @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "d705f411", + "id": "e85dcb31", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git 
a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 34a34f66..5a0da0cd 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "75adfdca", + "id": "f0a655e1", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "b1a35fc7", + "id": "ebaa76d5", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "066943db", + "id": "5ae77856", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,17 +36,27 @@ { "cell_type": "code", "execution_count": null, - "id": "08d33f4d", + "id": "7e1776e3", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer>=0.1.5 pillow>=12.0.0" + "!pip install -q data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "ae5b19c9", + "id": "b00e80af", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q pillow>=12.0.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b3d1f25", "metadata": {}, "outputs": [], "source": [ @@ -63,7 +73,7 @@ }, { "cell_type": "markdown", - "id": "db89c9ed", + "id": "fbcb7d19", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -74,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66985142", + "id": "df240606", "metadata": {}, "outputs": [], "source": [ @@ -105,7 +115,7 @@ }, { "cell_type": "markdown", - "id": "59fc1895", + "id": "b8468ab3", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -118,7 +128,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59d34a45", + "id": "80744554", "metadata": {}, "outputs": [], "source": [ @@ -127,7 +137,7 @@ }, { "cell_type": "markdown", - 
"id": "60ea632e", + "id": "07c8bba9", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -144,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f1e4145", + "id": "debb915a", "metadata": {}, "outputs": [], "source": [ @@ -167,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "172662a0", + "id": "a1a5d4d6", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -182,7 +192,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0cad1986", + "id": "ec8cccfb", "metadata": {}, "outputs": [], "source": [ @@ -191,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "24de8948", + "id": "5eb9043c", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -208,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c7c2ba0", + "id": "4488c772", "metadata": {}, "outputs": [], "source": [ @@ -223,7 +233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce431f01", + "id": "acb0c0a5", "metadata": {}, "outputs": [], "source": [ @@ -271,7 +281,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41a18a82", + "id": "3877ebc0", "metadata": {}, "outputs": [], "source": [ @@ -289,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "56e7ed9c", + "id": "f550fae3", "metadata": {}, "outputs": [], "source": [ @@ -299,7 +309,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2c59dd7e", + "id": "c58801b5", "metadata": {}, "outputs": [], "source": [ @@ -313,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92c6dc75", + "id": "213807d0", "metadata": { "lines_to_next_cell": 2 }, @@ -342,7 +352,7 @@ }, { "cell_type": "markdown", - "id": "d83a7369", + "id": "6d4138a6", "metadata": { "lines_to_next_cell": 2 }, @@ -350,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "cf6cf8ca", + "id": "1e7a0d4c", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -367,7 +377,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "e62d3c8d", + "id": "84ad88ca", "metadata": {}, "outputs": [], "source": [ @@ -377,7 +387,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80485f13", + "id": "39f2467e", "metadata": {}, "outputs": [], "source": [ @@ -388,7 +398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "207a545a", + "id": "9b6f6e49", "metadata": {}, "outputs": [], "source": [ @@ -398,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "d873ef3f", + "id": "08075e2a", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -411,7 +421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "424d98c0", + "id": "e563d826", "metadata": {}, "outputs": [], "source": [ @@ -421,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "dcdb8197", + "id": "79679d0c", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -432,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "93f84a52", + "id": "99ad3f66", "metadata": { "lines_to_next_cell": 2 }, @@ -456,7 +466,7 @@ }, { "cell_type": "markdown", - "id": "da7ce8d2", + "id": "aee25107", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -469,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c71f77a7", + "id": "2a665137", "metadata": {}, "outputs": [], "source": [ @@ -479,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "282da6b7", + "id": "78226714", "metadata": {}, "outputs": [], "source": [ @@ -492,7 +502,7 @@ { "cell_type": "code", "execution_count": null, - "id": "09214b2d", + "id": "58f862fd", "metadata": {}, "outputs": [], "source": [ @@ -504,7 +514,7 @@ }, { "cell_type": "markdown", - "id": "a2ea6c50", + "id": "d9e8b66b", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 18759076..be7a7c51 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -9,12 +9,22 @@ {% block content %} {% if page.nb_url %} +
- Open In Colab - - - {% include ".icons/material/download.svg" %} + Open In Colab + +
+ + {% include ".icons/material/download.svg" %} + {% endif %} {{ super() }} diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index fab89c34..87110b87 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -17,8 +17,6 @@ from nbformat import NotebookNode from nbformat.v4 import new_code_cell, new_markdown_cell -from data_designer import __version__ as data_designer_version - IMPORT_SECTION_MARKER = "### πŸ“¦ Import the essentials" COLAB_SETUP_MARKDOWN = """\ @@ -28,11 +26,14 @@ """ ADDITIONAL_DEPENDENCIES = { - "4-providing-images-as-context.py": " pillow>=12.0.0", + "4-providing-images-as-context.py": "pillow>=12.0.0", } COLAB_INSTALL_CELL = """\ -!pip install -q data-designer>={version}{deps}""" +!pip install -q data-designer""" + +COLAB_DEPENDENCIES_CELL = """\ +!pip install -q {deps}""" COLAB_API_KEY_CELL = """\ import getpass @@ -48,20 +49,13 @@ def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode]: """Create the Colab-specific setup cells to inject before imports.""" - dd_version_parts = [int(part) for part in data_designer_version.split(".")[:3]] - dd_version_parts[-1] -= 1 # current version is dev and already incremented - current_dd_version = ".".join(str(part) for part in dd_version_parts) - - return [ - new_markdown_cell(source=COLAB_SETUP_MARKDOWN), - new_code_cell( - source=COLAB_INSTALL_CELL.format( - version=current_dd_version, - deps=additional_dependencies, - ) - ), - new_code_cell(source=COLAB_API_KEY_CELL), - ] + cells = [] + cells += [new_markdown_cell(source=COLAB_SETUP_MARKDOWN)] + cells += [new_code_cell(source=COLAB_INSTALL_CELL)] + if additional_dependencies: + cells += [new_code_cell(source=COLAB_DEPENDENCIES_CELL.format(deps=additional_dependencies))] + cells += [new_code_cell(source=COLAB_API_KEY_CELL)] + return cells def find_import_section_index(cells: list[NotebookNode]) -> int: From 
2d59da0fb0efb9ed156db8db27ae3869fad764ec Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 13:38:51 -0300 Subject: [PATCH 12/14] forgot -U --- docs/scripts/generate_colab_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index 87110b87..34a43392 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -30,7 +30,7 @@ } COLAB_INSTALL_CELL = """\ -!pip install -q data-designer""" +!pip install -qU data-designer""" COLAB_DEPENDENCIES_CELL = """\ !pip install -q {deps}""" From bc27e1d39194990012dd02952d79efed5803d7aa Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 13:39:05 -0300 Subject: [PATCH 13/14] updating notebooks --- docs/colab_notebooks/1-the-basics.ipynb | 64 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 60 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 56 +++++++-------- .../4-providing-images-as-context.ipynb | 72 +++++++++---------- 4 files changed, 126 insertions(+), 126 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index f897396b..a132d691 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "08652297", + "id": "c95de131", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "56453533", + "id": "b4c9635c", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,17 +25,17 @@ { "cell_type": "code", "execution_count": null, - "id": "1bebd037", + "id": "32b06b7f", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer" + "!pip install -qU data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "4a80501c", + "id": "d4b2540d", "metadata": {}, "outputs": [], 
"source": [ @@ -52,7 +52,7 @@ }, { "cell_type": "markdown", - "id": "e1197e2f", + "id": "090d5b4d", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c84d7f47", + "id": "81d9e5a1", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "4f7cd0e0", + "id": "004ea5a1", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d2a3a60", + "id": "fe6a6d01", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "4239c705", + "id": "e15e3fe6", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dbba56d6", + "id": "70e94897", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "b1d7bf14", + "id": "84426488", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4523896b", + "id": "fe15fafd", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "9f808014", + "id": "a079da7b", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32c3b1f7", + "id": "b2268fab", "metadata": {}, "outputs": [], "source": [ @@ -205,7 +205,7 @@ }, { "cell_type": "markdown", - "id": "3ebd008d", + "id": "77804a37", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -214,7 +214,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70aafa95", + "id": "cfdca13c", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { 
"cell_type": "markdown", - "id": "e580194b", + "id": "61b952e4", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -304,7 +304,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64e17656", + "id": "c890342d", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +341,7 @@ }, { "cell_type": "markdown", - "id": "b308d402", + "id": "de601ca2", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90e9079e", + "id": "ed6e2c88", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "bdc0bb9d", + "id": "cfa7a171", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -410,7 +410,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7796b12c", + "id": "d0a2c8fe", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4b1f7ce", + "id": "d56cc305", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8c4b13b2", + "id": "a923e1f8", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "d948303e", + "id": "87b434bb", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e431bd3f", + "id": "64dd6ed3", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "8a0eff6e", + "id": "449df416", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe4297f6", + "id": "6c2deda4", "metadata": {}, "outputs": [], "source": [ @@ -487,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf9c8b0", + "id": "7e41c983", "metadata": {}, "outputs": [], 
"source": [ @@ -500,7 +500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2544e10b", + "id": "d5268407", "metadata": {}, "outputs": [], "source": [ @@ -512,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "d3defc17", + "id": "874083d2", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 6f00e3d4..7e50b9d4 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b21c4a12", + "id": "62d29d7b", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "c4584bdd", + "id": "c10e813d", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,17 +27,17 @@ { "cell_type": "code", "execution_count": null, - "id": "ec958990", + "id": "e0db7e3b", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer" + "!pip install -qU data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "a6bd19ba", + "id": "ce081afb", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "1c59c24a", + "id": "2cb5e676", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3498a0f3", + "id": "f7020536", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "ff6a567e", + "id": "37d5768b", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "795c6336", + "id": "e5cf138c", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ }, { "cell_type": 
"markdown", - "id": "f91d0f1e", + "id": "d614fac6", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "efd7c908", + "id": "dfbd7bae", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "c72f57d3", + "id": "fdb0b22f", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eecdbd48", + "id": "af64349c", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ }, { "cell_type": "markdown", - "id": "5d839066", + "id": "c07f592f", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92385cbd", + "id": "1b9ae5f1", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "ca8768b3", + "id": "8893aaf1", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -245,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe04a2ff", + "id": "4b78e470", "metadata": {}, "outputs": [], "source": [ @@ -354,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "705b8f82", + "id": "c0d2e97a", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -371,7 +371,7 @@ { "cell_type": "code", "execution_count": null, - "id": "178b4b52", + "id": "79625928", "metadata": {}, "outputs": [], "source": [ @@ -425,7 +425,7 @@ }, { "cell_type": "markdown", - "id": "52429447", + "id": "7df7b4af", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a4e0c81d", + "id": "e43873ef", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "efd81057", + "id": "d211b3df", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7217232", + "id": "5a7f2e4c", "metadata": {}, "outputs": [], "source": [ @@ -473,7 +473,7 @@ }, { "cell_type": "markdown", - "id": "0c3c16ce", + "id": "3b10d9e4", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -486,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f7c417e", + "id": "5b0bc4f2", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "8b3aafb9", + "id": "a87e8b8d", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -509,7 +509,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8092031", + "id": "92a3b906", "metadata": {}, "outputs": [], "source": [ @@ -519,7 +519,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c5b7f83", + "id": "48a153cf", "metadata": {}, "outputs": [], "source": [ @@ -532,7 +532,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d566d53", + "id": "53a22fbc", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +544,7 @@ }, { "cell_type": "markdown", - "id": "fd5c1377", + "id": "e26365d8", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 1f012250..b163c45c 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "f78c3264", + "id": "ef3027fd", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "becb0165", + "id": "69a3dfd7", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,17 +27,17 @@ { "cell_type": "code", "execution_count": null, 
- "id": "a317a0a6", + "id": "2d93af83", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer" + "!pip install -qU data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "82122fb6", + "id": "26a826f9", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "9f918bb8", + "id": "45f7ec9f", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b2ac5d4", + "id": "c9753116", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "f6bdb412", + "id": "11bed01d", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -93,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d8e966ae", + "id": "d7d581d0", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "b14c27a7", + "id": "9213deef", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -119,7 +119,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d30c9613", + "id": "1de03c94", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "80234300", + "id": "d2eab698", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3bd9867b", + "id": "4cd6bb8d", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "e3e49033", + "id": "736f4dc5", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -200,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d5efc62c", + "id": "c6094cb7", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ }, { "cell_type": "markdown", - "id": "a24b6158", + "id": "f14c8397", "metadata": {}, "source": [ "## 🎨 Designing our 
synthetic patient notes dataset\n", @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28582202", + "id": "589d865b", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "36966929", + "id": "6625b19a", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6927b4c6", + "id": "77854ddc", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25a5a9a5", + "id": "b8231cc6", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40148f78", + "id": "176855ed", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "c6922c71", + "id": "07185bb3", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "91ecf297", + "id": "ea0ac5e3", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +396,7 @@ }, { "cell_type": "markdown", - "id": "0f2d6ebb", + "id": "945ea8c0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a6ac78a", + "id": "149b509b", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3ab98c20", + "id": "0f1849e6", "metadata": {}, "outputs": [], "source": [ @@ -432,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34c97828", + "id": "9442e3e2", "metadata": {}, "outputs": [], "source": [ @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "e85dcb31", + "id": "69e9831e", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 
5a0da0cd..b9019ce3 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "f0a655e1", + "id": "e2dc539e", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "ebaa76d5", + "id": "037eb557", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "5ae77856", + "id": "a03c2c76", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,17 +36,17 @@ { "cell_type": "code", "execution_count": null, - "id": "7e1776e3", + "id": "4c804945", "metadata": {}, "outputs": [], "source": [ - "!pip install -q data-designer" + "!pip install -qU data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "b00e80af", + "id": "7d5b55a9", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0b3d1f25", + "id": "d6d4560f", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ }, { "cell_type": "markdown", - "id": "fbcb7d19", + "id": "81cbdc29", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -84,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "df240606", + "id": "9d046649", "metadata": {}, "outputs": [], "source": [ @@ -115,7 +115,7 @@ }, { "cell_type": "markdown", - "id": "b8468ab3", + "id": "2411260e", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -128,7 +128,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80744554", + "id": "26fbb414", "metadata": {}, "outputs": [], "source": [ @@ -137,7 +137,7 @@ }, { "cell_type": "markdown", - "id": "07c8bba9", + "id": "a103a98c", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -154,7 +154,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "debb915a", + "id": "796b0217", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "a1a5d4d6", + "id": "acf04c1a", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -192,7 +192,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ec8cccfb", + "id": "fe51662d", "metadata": {}, "outputs": [], "source": [ @@ -201,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "5eb9043c", + "id": "4120847f", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4488c772", + "id": "39e8f5b2", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acb0c0a5", + "id": "7931046e", "metadata": {}, "outputs": [], "source": [ @@ -281,7 +281,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3877ebc0", + "id": "fe1a3c34", "metadata": {}, "outputs": [], "source": [ @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f550fae3", + "id": "1031bd73", "metadata": {}, "outputs": [], "source": [ @@ -309,7 +309,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c58801b5", + "id": "4000604f", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "213807d0", + "id": "41996558", "metadata": { "lines_to_next_cell": 2 }, @@ -352,7 +352,7 @@ }, { "cell_type": "markdown", - "id": "6d4138a6", + "id": "92f765f7", "metadata": { "lines_to_next_cell": 2 }, @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "1e7a0d4c", + "id": "0c78cf89", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -377,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84ad88ca", + "id": "8eac11ab", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +387,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "39f2467e", + "id": "9dbee16c", "metadata": {}, "outputs": [], "source": [ @@ -398,7 +398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b6f6e49", + "id": "6540f043", "metadata": {}, "outputs": [], "source": [ @@ -408,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "08075e2a", + "id": "a265c96c", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -421,7 +421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e563d826", + "id": "7be20be6", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "79679d0c", + "id": "c72684f2", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "99ad3f66", + "id": "eb3cd699", "metadata": { "lines_to_next_cell": 2 }, @@ -466,7 +466,7 @@ }, { "cell_type": "markdown", - "id": "aee25107", + "id": "297bb3d7", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -479,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2a665137", + "id": "e9ccd061", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78226714", + "id": "a9cc51f5", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +502,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58f862fd", + "id": "218fb907", "metadata": {}, "outputs": [], "source": [ @@ -514,7 +514,7 @@ }, { "cell_type": "markdown", - "id": "d9e8b66b", + "id": "0376cdc2", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", From e94a5a864f798ad8cddc3b01a2a899cd205b15e6 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 12 Dec 2025 15:05:10 -0300 Subject: [PATCH 14/14] changing rule for injecting --- docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++-------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 +++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++------- 
.../4-providing-images-as-context.ipynb | 70 +++++++++---------- docs/scripts/generate_colab_notebooks.py | 10 +-- 5 files changed, 128 insertions(+), 126 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index a132d691..ee9e5ccc 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c95de131", + "id": "a4ac4d55", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "b4c9635c", + "id": "9e9f3c47", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,7 +25,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32b06b7f", + "id": "41b31194", "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d4b2540d", + "id": "502b3aba", "metadata": {}, "outputs": [], "source": [ @@ -52,7 +52,7 @@ }, { "cell_type": "markdown", - "id": "090d5b4d", + "id": "8c512fbc", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81d9e5a1", + "id": "8fae521f", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "004ea5a1", + "id": "e71d0256", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe6a6d01", + "id": "68fc7172", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "e15e3fe6", + "id": "9a821a27", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -123,7 +123,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70e94897", + "id": "a9515141", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +155,7 @@ }, { "cell_type": "markdown", - "id": "84426488", + 
"id": "3b940ab9", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -170,7 +170,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe15fafd", + "id": "ec21da7e", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +179,7 @@ }, { "cell_type": "markdown", - "id": "a079da7b", + "id": "85b2324e", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2268fab", + "id": "f49f435e", "metadata": {}, "outputs": [], "source": [ @@ -205,7 +205,7 @@ }, { "cell_type": "markdown", - "id": "77804a37", + "id": "f582b642", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -214,7 +214,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cfdca13c", + "id": "8cfc43b1", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "61b952e4", + "id": "2d0eea21", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -304,7 +304,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c890342d", + "id": "b5e65724", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +341,7 @@ }, { "cell_type": "markdown", - "id": "de601ca2", + "id": "e6788771", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed6e2c88", + "id": "a2705cd9", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +393,7 @@ }, { "cell_type": "markdown", - "id": "cfa7a171", + "id": "e3dd2f69", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -410,7 +410,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d0a2c8fe", + "id": "c6e43147", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "d56cc305", + "id": "fab77d01", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a923e1f8", + "id": "875ee6a6", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "87b434bb", + "id": "87b59e4b", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64dd6ed3", + "id": "5d347f4c", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "449df416", + "id": "d2fb84f2", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c2deda4", + "id": "71a31e85", "metadata": {}, "outputs": [], "source": [ @@ -487,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e41c983", + "id": "501e9092", "metadata": {}, "outputs": [], "source": [ @@ -500,7 +500,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d5268407", + "id": "6f217b4a", "metadata": {}, "outputs": [], "source": [ @@ -512,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "874083d2", + "id": "4da82b0f", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 7e50b9d4..10156a10 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "62d29d7b", + "id": "a70798bc", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "c10e813d", + "id": "4ed4c65d", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,7 +27,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "e0db7e3b", + "id": "73432e8e", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ce081afb", + "id": "1a6aac78", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "2cb5e676", + "id": "03df6d1c", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7020536", + "id": "befb6573", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "37d5768b", + "id": "a022d1ae", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -99,7 +99,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5cf138c", + "id": "568e1d91", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ }, { "cell_type": "markdown", - "id": "d614fac6", + "id": "de1ef709", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -125,7 +125,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dfbd7bae", + "id": "6757f43c", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "markdown", - "id": "fdb0b22f", + "id": "3d1ea9b6", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af64349c", + "id": "f9ad410a", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +181,7 @@ }, { "cell_type": "markdown", - "id": "c07f592f", + "id": "8f918afa", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -208,7 +208,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b9ae5f1", + "id": "6aafd123", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +236,7 @@ }, { "cell_type": "markdown", - "id": "8893aaf1", + "id": "9727c5ae", "metadata": {}, "source": [ "Next, let's design our product review dataset 
using a few more tricks compared to the previous notebook.\n" @@ -245,7 +245,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4b78e470", + "id": "9f9df709", "metadata": {}, "outputs": [], "source": [ @@ -354,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "c0d2e97a", + "id": "f42b7843", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -371,7 +371,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79625928", + "id": "d231f52e", "metadata": {}, "outputs": [], "source": [ @@ -425,7 +425,7 @@ }, { "cell_type": "markdown", - "id": "7df7b4af", + "id": "01dbf368", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e43873ef", + "id": "7d0389c6", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d211b3df", + "id": "b312d760", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a7f2e4c", + "id": "6252ae9a", "metadata": {}, "outputs": [], "source": [ @@ -473,7 +473,7 @@ }, { "cell_type": "markdown", - "id": "3b10d9e4", + "id": "ea32a75e", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -486,7 +486,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b0bc4f2", + "id": "c094a400", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "a87e8b8d", + "id": "8565a283", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -509,7 +509,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92a3b906", + "id": "d48a294c", "metadata": {}, "outputs": [], "source": [ @@ -519,7 +519,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48a153cf", + "id": "96268be4", "metadata": {}, "outputs": [], "source": [ @@ -532,7 +532,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "53a22fbc", + "id": "a2137aaf", "metadata": {}, "outputs": [], "source": [ @@ -544,7 +544,7 @@ }, { "cell_type": "markdown", - "id": "e26365d8", + "id": "c43a66cc", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index b163c45c..db5c9d27 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ef3027fd", + "id": "c4c424f2", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "69a3dfd7", + "id": "2ceb13f2", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d93af83", + "id": "ad4096cd", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26a826f9", + "id": "21336bf3", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "markdown", - "id": "45f7ec9f", + "id": "db8fa976", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +65,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c9753116", + "id": "c2e17b2b", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ }, { "cell_type": "markdown", - "id": "11bed01d", + "id": "29c28647", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -93,7 +93,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7d581d0", + "id": "cecf9242", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "9213deef", + "id": "74eab801", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -119,7 +119,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "1de03c94", + "id": "b31991e8", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "d2eab698", + "id": "006d1625", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -166,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4cd6bb8d", + "id": "396a4ed5", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +175,7 @@ }, { "cell_type": "markdown", - "id": "736f4dc5", + "id": "9860369b", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -200,7 +200,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c6094cb7", + "id": "65a2fa3c", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ }, { "cell_type": "markdown", - "id": "f14c8397", + "id": "ea11134a", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "589d865b", + "id": "209af41c", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +325,7 @@ }, { "cell_type": "markdown", - "id": "6625b19a", + "id": "32f43d20", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -342,7 +342,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77854ddc", + "id": "402a1025", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8231cc6", + "id": "de294013", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +363,7 @@ { "cell_type": "code", "execution_count": null, - "id": "176855ed", + "id": "a38c360d", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "07185bb3", + "id": "4a385031", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ea0ac5e3", + "id": "3531b1e4", "metadata": {}, "outputs": [], 
"source": [ @@ -396,7 +396,7 @@ }, { "cell_type": "markdown", - "id": "945ea8c0", + "id": "7fec6251", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "149b509b", + "id": "bace6acb", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f1849e6", + "id": "3eaafc74", "metadata": {}, "outputs": [], "source": [ @@ -432,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9442e3e2", + "id": "473a0c89", "metadata": {}, "outputs": [], "source": [ @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "69e9831e", + "id": "7408eef8", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index b9019ce3..bac63375 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e2dc539e", + "id": "ebd062e8", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "037eb557", + "id": "3ce5b7f7", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "a03c2c76", + "id": "04d0bfd4", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,7 +36,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c804945", + "id": "661740a3", "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d5b55a9", + "id": "b9c4188b", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d6d4560f", + "id": "ddaeb938", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ 
}, { "cell_type": "markdown", - "id": "81cbdc29", + "id": "c3aac5e8", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -84,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d046649", + "id": "c8b905ee", "metadata": {}, "outputs": [], "source": [ @@ -115,7 +115,7 @@ }, { "cell_type": "markdown", - "id": "2411260e", + "id": "f508d655", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -128,7 +128,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26fbb414", + "id": "f0607008", "metadata": {}, "outputs": [], "source": [ @@ -137,7 +137,7 @@ }, { "cell_type": "markdown", - "id": "a103a98c", + "id": "a0467d8c", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -154,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "796b0217", + "id": "1e4d9a15", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "acf04c1a", + "id": "6d66a8a6", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -192,7 +192,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe51662d", + "id": "80ad9b04", "metadata": {}, "outputs": [], "source": [ @@ -201,7 +201,7 @@ }, { "cell_type": "markdown", - "id": "4120847f", + "id": "75935c33", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39e8f5b2", + "id": "868e41af", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7931046e", + "id": "5466e56f", "metadata": {}, "outputs": [], "source": [ @@ -281,7 +281,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fe1a3c34", + "id": "bbca6568", "metadata": {}, "outputs": [], "source": [ @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1031bd73", + "id": "5ce17e71", "metadata": {}, "outputs": [], 
"source": [ @@ -309,7 +309,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4000604f", + "id": "9c0faf28", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +323,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41996558", + "id": "35bc8b77", "metadata": { "lines_to_next_cell": 2 }, @@ -352,7 +352,7 @@ }, { "cell_type": "markdown", - "id": "92f765f7", + "id": "16ac3f58", "metadata": { "lines_to_next_cell": 2 }, @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "0c78cf89", + "id": "1d13a0e4", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -377,7 +377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8eac11ab", + "id": "336af89f", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +387,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9dbee16c", + "id": "5a57a536", "metadata": {}, "outputs": [], "source": [ @@ -398,7 +398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6540f043", + "id": "9e05b387", "metadata": {}, "outputs": [], "source": [ @@ -408,7 +408,7 @@ }, { "cell_type": "markdown", - "id": "a265c96c", + "id": "f69d543e", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -421,7 +421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7be20be6", + "id": "e3cb66a7", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +431,7 @@ }, { "cell_type": "markdown", - "id": "c72684f2", + "id": "60815241", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -442,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb3cd699", + "id": "3c9dddf6", "metadata": { "lines_to_next_cell": 2 }, @@ -466,7 +466,7 @@ }, { "cell_type": "markdown", - "id": "297bb3d7", + "id": "eb6e2469", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -479,7 +479,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e9ccd061", + "id": "d6a0ed1c", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "a9cc51f5", + "id": "255b8f6f", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +502,7 @@ { "cell_type": "code", "execution_count": null, - "id": "218fb907", + "id": "43b935b3", "metadata": {}, "outputs": [], "source": [ @@ -514,7 +514,7 @@ }, { "cell_type": "markdown", - "id": "0376cdc2", + "id": "a74f1121", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index 34a43392..cb375776 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -17,8 +17,6 @@ from nbformat import NotebookNode from nbformat.v4 import new_code_cell, new_markdown_cell -IMPORT_SECTION_MARKER = "### πŸ“¦ Import the essentials" - COLAB_SETUP_MARKDOWN = """\ ### ⚑ Colab Setup @@ -60,12 +58,16 @@ def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode] def find_import_section_index(cells: list[NotebookNode]) -> int: """Find the index of the 'Import the essentials' markdown cell.""" + first_code_cell_index = -1 for i, cell in enumerate(cells): + if first_code_cell_index == -1 and cell.get("cell_type") == "code": + first_code_cell_index = i + if cell.get("cell_type") == "markdown": source = cell.get("source", "") - if IMPORT_SECTION_MARKER in source: + if "import" in source.lower() and "essentials" in source.lower(): return i - return -1 + return first_code_cell_index def process_notebook(notebook: NotebookNode, source_path: Path) -> NotebookNode: