diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index ee9e5ccc..eb9db753 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a4ac4d55", + "id": "39d7d274", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "9e9f3c47", + "id": "60f1d002", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -25,17 +25,18 @@ { "cell_type": "code", "execution_count": null, - "id": "41b31194", + "id": "99c42292", "metadata": {}, "outputs": [], "source": [ - "!pip install -qU data-designer" + "%%capture\n", + "!pip install -U data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "502b3aba", + "id": "2c959ca9", "metadata": {}, "outputs": [], "source": [ @@ -52,7 +53,7 @@ }, { "cell_type": "markdown", - "id": "8c512fbc", + "id": "bc185897", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -63,7 +64,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8fae521f", + "id": "dc3a2d9d", "metadata": {}, "outputs": [], "source": [ @@ -84,20 +85,20 @@ }, { "cell_type": "markdown", - "id": "e71d0256", + "id": "36c5f571", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", "\n", "- `DataDesigner` is the main object is responsible for managing the data generation process.\n", "\n", - "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "68fc7172", + "id": "61b23c70", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +107,7 @@ }, { "cell_type": 
"markdown", - "id": "9a821a27", + "id": "3c9b7cb6", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +116,7 @@ "\n", "- The \"model alias\" is used to reference the model in the Data Designer config (as we will see below).\n", "\n", - "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details).\n", "\n", "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" ] @@ -123,7 +124,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a9515141", + "id": "b86f6217", "metadata": {}, "outputs": [], "source": [ @@ -155,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "3b940ab9", + "id": "1f089871", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -170,7 +171,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ec21da7e", + "id": "3d666193", "metadata": {}, "outputs": [], "source": [ @@ -179,7 +180,7 @@ }, { "cell_type": "markdown", - "id": "85b2324e", + "id": "e88c8881", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -196,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f49f435e", + "id": "79fb85c6", "metadata": {}, "outputs": [], "source": [ @@ -205,7 +206,7 @@ }, { "cell_type": "markdown", - "id": "f582b642", + "id": "5106cc10", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -214,7 +215,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8cfc43b1", + "id": "22b97af1", "metadata": {}, "outputs": [], "source": [ @@ -295,7 +296,7 @@ }, { 
"cell_type": "markdown", - "id": "2d0eea21", + "id": "4857b085", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -304,7 +305,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5e65724", + "id": "9e90b3cb", "metadata": {}, "outputs": [], "source": [ @@ -341,7 +342,7 @@ }, { "cell_type": "markdown", - "id": "e6788771", + "id": "b36a153b", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -356,7 +357,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2705cd9", + "id": "4da88fe6", "metadata": {}, "outputs": [], "source": [ @@ -393,7 +394,7 @@ }, { "cell_type": "markdown", - "id": "e3dd2f69", + "id": "5f1b9ac8", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -410,7 +411,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c6e43147", + "id": "543e2f9c", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fab77d01", + "id": "26136a8a", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +432,7 @@ { "cell_type": "code", "execution_count": null, - "id": "875ee6a6", + "id": "aca4360d", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +442,7 @@ }, { "cell_type": "markdown", - "id": "87b59e4b", + "id": "35ca0470", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -454,7 +455,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5d347f4c", + "id": "d55b402d", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +465,7 @@ }, { "cell_type": "markdown", - "id": "d2fb84f2", + "id": "245b48cf", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -477,7 +478,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71a31e85", + "id": "fc803eb0", "metadata": {}, "outputs": [], "source": [ @@ -487,7 +488,7 @@ { "cell_type": "code", "execution_count": null, - "id": "501e9092", + "id": "881c2043", "metadata": {}, "outputs": [], 
"source": [ @@ -500,7 +501,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f217b4a", + "id": "d79860d4", "metadata": {}, "outputs": [], "source": [ @@ -512,16 +513,18 @@ }, { "cell_type": "markdown", - "id": "4da82b0f", + "id": "b4b45176", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", "\n", "Now that you've seen the basics of Data Designer, check out the following notebooks to learn more about:\n", "\n", - "- [Structured outputs and jinja expressions](/notebooks/2-structured-outputs-and-jinja-expressions/)\n", + "- [Structured outputs and jinja expressions](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/2-structured-outputs-and-jinja-expressions/)\n", "\n", - "- [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/)\n" + "- [Seeding synthetic data generation with an external dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/)\n", + "\n", + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" ] } ], diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 10156a10..0cb65da6 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a70798bc", + "id": "33b48b4e", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -11,12 +11,12 @@ "\n", "In this notebook, we will continue our exploration of Data Designer, demonstrating more advanced data generation using structured outputs and Jinja expressions.\n", "\n", - "If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.\n" + "If 
this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.\n" ] }, { "cell_type": "markdown", - "id": "4ed4c65d", + "id": "c29f9af1", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,17 +27,18 @@ { "cell_type": "code", "execution_count": null, - "id": "73432e8e", + "id": "3a5601fb", "metadata": {}, "outputs": [], "source": [ - "!pip install -qU data-designer" + "%%capture\n", + "!pip install -U data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "1a6aac78", + "id": "de2f0af4", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +55,7 @@ }, { "cell_type": "markdown", - "id": "03df6d1c", + "id": "400795be", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "befb6573", + "id": "378f6853", "metadata": {}, "outputs": [], "source": [ @@ -86,20 +87,20 @@ }, { "cell_type": "markdown", - "id": "a022d1ae", + "id": "15a1ac9f", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", "\n", "- `DataDesigner` is the main object that is used to interface with the library.\n", "\n", - "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "568e1d91", + "id": "9d7654e5", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +109,7 @@ }, { "cell_type": "markdown", - "id": "de1ef709", + "id": "27ba0edb", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +118,7 @@ "\n", "- The \"model alias\" is used to reference the model in the Data 
Designer config (as we will see below).\n", "\n", - "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details).\n", "\n", "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" ] @@ -125,7 +126,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6757f43c", + "id": "c24ee00e", "metadata": {}, "outputs": [], "source": [ @@ -157,7 +158,7 @@ }, { "cell_type": "markdown", - "id": "3d1ea9b6", + "id": "a106edc9", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -172,7 +173,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f9ad410a", + "id": "3a167f7c", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +182,7 @@ }, { "cell_type": "markdown", - "id": "8f918afa", + "id": "fcf68c72", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -208,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6aafd123", + "id": "8f8f034e", "metadata": {}, "outputs": [], "source": [ @@ -236,7 +237,7 @@ }, { "cell_type": "markdown", - "id": "9727c5ae", + "id": "9d5c722a", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -245,7 +246,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f9df709", + "id": "013caa3d", "metadata": {}, "outputs": [], "source": [ @@ -354,7 +355,7 @@ }, { "cell_type": "markdown", - "id": "f42b7843", + "id": "ef426a65", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -371,7 +372,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "d231f52e", + "id": "27abbd6d", "metadata": {}, "outputs": [], "source": [ @@ -425,7 +426,7 @@ }, { "cell_type": "markdown", - "id": "01dbf368", + "id": "18a8461e", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -442,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d0389c6", + "id": "2f75eee6", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +453,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b312d760", + "id": "950c9596", "metadata": {}, "outputs": [], "source": [ @@ -463,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6252ae9a", + "id": "5c04ca5a", "metadata": {}, "outputs": [], "source": [ @@ -473,7 +474,7 @@ }, { "cell_type": "markdown", - "id": "ea32a75e", + "id": "b0704f47", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -486,7 +487,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c094a400", + "id": "54a609ce", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +497,7 @@ }, { "cell_type": "markdown", - "id": "8565a283", + "id": "6ae0a8a5", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -509,7 +510,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d48a294c", + "id": "e22c387a", "metadata": {}, "outputs": [], "source": [ @@ -519,7 +520,7 @@ { "cell_type": "code", "execution_count": null, - "id": "96268be4", + "id": "9c6b36b3", "metadata": {}, "outputs": [], "source": [ @@ -532,7 +533,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2137aaf", + "id": "138ed487", "metadata": {}, "outputs": [], "source": [ @@ -544,14 +545,16 @@ }, { "cell_type": "markdown", - "id": "c43a66cc", + "id": "fde73253", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", "\n", "Check out the following notebook to learn more about:\n", "\n", - "- [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/)\n" + "- [Seeding synthetic data generation 
with an external dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/)\n", + "\n", + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" ] } ], diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index db5c9d27..e623ac8b 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c4c424f2", + "id": "5a6b2b3f", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -11,12 +11,12 @@ "\n", "In this notebook, we will demonstrate how to seed synthetic data generation in Data Designer with an external dataset.\n", "\n", - "If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.\n" + "If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.\n" ] }, { "cell_type": "markdown", - "id": "2ceb13f2", + "id": "137d8273", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -27,17 +27,18 @@ { "cell_type": "code", "execution_count": null, - "id": "ad4096cd", + "id": "f6ce3dc2", "metadata": {}, "outputs": [], "source": [ - "!pip install -qU data-designer" + "%%capture\n", + "!pip install -U data-designer" ] }, { "cell_type": "code", "execution_count": null, - "id": "21336bf3", + "id": "70d6ffc8", "metadata": {}, "outputs": [], "source": [ @@ -54,7 +55,7 @@ }, { "cell_type": "markdown", - "id": "db8fa976", + "id": "ce0313c9", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -65,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c2e17b2b", + "id":
"2aa2cdf1", "metadata": {}, "outputs": [], "source": [ @@ -80,20 +81,20 @@ }, { "cell_type": "markdown", - "id": "29c28647", + "id": "9769f392", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", "\n", "- `DataDesigner` is the main object is responsible for managing the data generation process.\n", "\n", - "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "cecf9242", + "id": "d79db916", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "74eab801", + "id": "08dd3894", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -111,7 +112,7 @@ "\n", "- The \"model alias\" is used to reference the model in the Data Designer config (as we will see below).\n", "\n", - "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details).\n", "\n", "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" ] @@ -119,7 +120,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b31991e8", + "id": "3994368e", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +152,7 @@ }, { "cell_type": "markdown", - "id": "006d1625", + "id": "5f12d6d2", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -166,7 
+167,7 @@ { "cell_type": "code", "execution_count": null, - "id": "396a4ed5", + "id": "0f3d7640", "metadata": {}, "outputs": [], "source": [ @@ -175,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "9860369b", + "id": "1f08df99", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -200,7 +201,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65a2fa3c", + "id": "f265e74c", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +219,7 @@ }, { "cell_type": "markdown", - "id": "ea11134a", + "id": "6bffa239", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -235,7 +236,7 @@ { "cell_type": "code", "execution_count": null, - "id": "209af41c", + "id": "15e486a6", "metadata": {}, "outputs": [], "source": [ @@ -325,7 +326,7 @@ }, { "cell_type": "markdown", - "id": "32f43d20", + "id": "5cfe2edd", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -342,7 +343,7 @@ { "cell_type": "code", "execution_count": null, - "id": "402a1025", + "id": "d4a59576", "metadata": {}, "outputs": [], "source": [ @@ -352,7 +353,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de294013", + "id": "1c5aedd4", "metadata": {}, "outputs": [], "source": [ @@ -363,7 +364,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a38c360d", + "id": "d17df0a5", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +374,7 @@ }, { "cell_type": "markdown", - "id": "4a385031", + "id": "3389a088", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -386,7 +387,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3531b1e4", + "id": "b0443498", "metadata": {}, "outputs": [], "source": [ @@ -396,7 +397,7 @@ }, { "cell_type": "markdown", - "id": "7fec6251", + "id": "0527a606", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -409,7 +410,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bace6acb", + "id": "2118b49e", "metadata": {}, "outputs": [], "source": [ 
@@ -419,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3eaafc74", + "id": "c4f9ad59", "metadata": {}, "outputs": [], "source": [ @@ -432,7 +433,7 @@ { "cell_type": "code", "execution_count": null, - "id": "473a0c89", + "id": "8517866d", "metadata": {}, "outputs": [], "source": [ @@ -444,12 +445,14 @@ }, { "cell_type": "markdown", - "id": "7408eef8", + "id": "b62dd069", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", "\n", - "Use Data Designer to generate synthetic data for your specific use case!\n" + "Check out the following notebook to learn more about:\n", + "\n", + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" ] } ], diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index bac63375..e48f2bde 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ebd062e8", + "id": "fc93d603", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "3ce5b7f7", + "id": "31146c45", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -20,12 +20,12 @@ "- ✨ **Visual Document Processing**: Converting images to chat-ready format for model consumption\n", "- πŸ” **Vision-Language Generation**: Using vision models to generate detailed summaries from images\n", "\n", - "If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.\n" + "If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.\n" ] }, { "cell_type": 
"markdown", - "id": "04d0bfd4", + "id": "b237af81", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -36,27 +36,18 @@ { "cell_type": "code", "execution_count": null, - "id": "661740a3", + "id": "00b316a6", "metadata": {}, "outputs": [], "source": [ - "!pip install -qU data-designer" + "%%capture\n", + "!pip install -U data-designer pillow>=12.0.0" ] }, { "cell_type": "code", "execution_count": null, - "id": "b9c4188b", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -q pillow>=12.0.0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ddaeb938", + "id": "847a6f88", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +64,7 @@ }, { "cell_type": "markdown", - "id": "c3aac5e8", + "id": "5a0e2a31", "metadata": {}, "source": [ "### πŸ“¦ Import the essentials\n", @@ -84,7 +75,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c8b905ee", + "id": "7ec632f1", "metadata": {}, "outputs": [], "source": [ @@ -115,20 +106,20 @@ }, { "cell_type": "markdown", - "id": "f508d655", + "id": "66efe0cc", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", "\n", "- `DataDesigner` is the main object is responsible for managing the data generation process.\n", "\n", - "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used.\n" + "- When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "f0607008", + "id": "e9059d31", "metadata": {}, "outputs": [], "source": [ @@ -137,7 +128,7 @@ }, { "cell_type": "markdown", - "id": "a0467d8c", + "id": "26d60e67", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -146,7 +137,7 @@ "\n", "- The \"model alias\" is used to reference the model in the Data Designer config (as 
we will see below).\n", "\n", - "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).\n", + "- The \"model provider\" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details).\n", "\n", "- By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.\n" ] @@ -154,7 +145,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1e4d9a15", + "id": "e5b29b57", "metadata": {}, "outputs": [], "source": [ @@ -177,7 +168,7 @@ }, { "cell_type": "markdown", - "id": "6d66a8a6", + "id": "c7c68fce", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -192,7 +183,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80ad9b04", + "id": "2ab84fb1", "metadata": {}, "outputs": [], "source": [ @@ -201,7 +192,7 @@ }, { "cell_type": "markdown", - "id": "75935c33", + "id": "bdc1fa29", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -218,7 +209,7 @@ { "cell_type": "code", "execution_count": null, - "id": "868e41af", + "id": "3baa7ba2", "metadata": {}, "outputs": [], "source": [ @@ -233,7 +224,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5466e56f", + "id": "f3780aee", "metadata": {}, "outputs": [], "source": [ @@ -281,7 +272,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbca6568", + "id": "5c46877e", "metadata": {}, "outputs": [], "source": [ @@ -299,7 +290,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ce17e71", + "id": "4ca338da", "metadata": {}, "outputs": [], "source": [ @@ -309,7 +300,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c0faf28", + "id": "0cb41a37", "metadata": {}, "outputs": [], "source": [ @@ -323,7 +314,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "35bc8b77", + "id": "607bb265", "metadata": { "lines_to_next_cell": 2 }, @@ -352,7 +343,7 @@ }, { "cell_type": "markdown", - "id": "16ac3f58", + "id": "984241b9", "metadata": { "lines_to_next_cell": 2 }, @@ -360,7 +351,7 @@ }, { "cell_type": "markdown", - "id": "1d13a0e4", + "id": "eca1a5ea", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -377,7 +368,7 @@ { "cell_type": "code", "execution_count": null, - "id": "336af89f", + "id": "4d386a38", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +378,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a57a536", + "id": "6209d609", "metadata": {}, "outputs": [], "source": [ @@ -398,7 +389,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e05b387", + "id": "5aedc7fd", "metadata": {}, "outputs": [], "source": [ @@ -408,7 +399,7 @@ }, { "cell_type": "markdown", - "id": "f69d543e", + "id": "fc339219", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -421,7 +412,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3cb66a7", + "id": "87ccc372", "metadata": {}, "outputs": [], "source": [ @@ -431,7 +422,7 @@ }, { "cell_type": "markdown", - "id": "60815241", + "id": "c090f413", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -442,7 +433,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c9dddf6", + "id": "1e1a054c", "metadata": { "lines_to_next_cell": 2 }, @@ -466,7 +457,7 @@ }, { "cell_type": "markdown", - "id": "eb6e2469", + "id": "cab83636", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -479,7 +470,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d6a0ed1c", + "id": "7fa66a2e", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +480,7 @@ { "cell_type": "code", "execution_count": null, - "id": "255b8f6f", + "id": "8f92b5be", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +493,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"43b935b3", + "id": "d1bddcee", "metadata": {}, "outputs": [], "source": [ @@ -514,7 +505,7 @@ }, { "cell_type": "markdown", - "id": "a74f1121", + "id": "46f68f95", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/notebook_source/1-the-basics.py b/docs/notebook_source/1-the-basics.py index 1c3c200b..de890fb0 100644 --- a/docs/notebook_source/1-the-basics.py +++ b/docs/notebook_source/1-the-basics.py @@ -46,7 +46,7 @@ # # - `DataDesigner` is the main object is responsible for managing the data generation process. # -# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used. +# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. # # %% @@ -59,7 +59,7 @@ # # - The "model alias" is used to reference the model in the Data Designer config (as we will see below). # -# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details). +# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details). # # - By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. 
# @@ -336,7 +336,9 @@ # # Now that you've seen the basics of Data Designer, check out the following notebooks to learn more about: # -# - [Structured outputs and jinja expressions](/notebooks/2-structured-outputs-and-jinja-expressions/) +# - [Structured outputs and jinja expressions](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/2-structured-outputs-and-jinja-expressions/) # -# - [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/) +# - [Seeding synthetic data generation with an external dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/) +# +# - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) # diff --git a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py index e099cd82..f968a416 100644 --- a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py +++ b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py @@ -19,7 +19,7 @@ # # In this notebook, we will continue our exploration of Data Designer, demonstrating more advanced data generation using structured outputs and Jinja expressions. # -# If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series. +# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. # # %% [markdown] @@ -48,7 +48,7 @@ # # - `DataDesigner` is the main object that is used to interface with the library. # -# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used. 
+# When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. # # %% @@ -61,7 +61,7 @@ # # - The "model alias" is used to reference the model in the Data Designer config (as we will see below). # -# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details). +# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details). # # - By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. # @@ -380,5 +380,7 @@ class ProductReview(BaseModel): # # Check out the following notebook to learn more about: # -# - [Seeding synthetic data generation with an external dataset](/notebooks/3-seeding-with-a-dataset/) +# - [Seeding synthetic data generation with an external dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/) +# +# - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) # diff --git a/docs/notebook_source/3-seeding-with-a-dataset.py b/docs/notebook_source/3-seeding-with-a-dataset.py index cdd1f744..7c0d07e1 100644 --- a/docs/notebook_source/3-seeding-with-a-dataset.py +++ b/docs/notebook_source/3-seeding-with-a-dataset.py @@ -19,7 +19,7 @@ # # In this notebook, we will demonstrate how to seed synthetic data generation in Data Designer with an external dataset. # -# If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series.
+# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. # # %% [markdown] @@ -42,7 +42,7 @@ # # - `DataDesigner` is the main object is responsible for managing the data generation process. # -# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used. +# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. # # %% @@ -55,7 +55,7 @@ # # - The "model alias" is used to reference the model in the Data Designer config (as we will see below). # -# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details). +# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details). # # - By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. # @@ -288,5 +288,7 @@ # %% [markdown] # ## ⏭️ Next Steps # -# Use Data Designer to generate synthetic data for your specific use case!
+# Check out the following notebook to learn more about: +# +# - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) # diff --git a/docs/notebook_source/4-providing-images-as-context.py b/docs/notebook_source/4-providing-images-as-context.py index dc2513cd..6265e01f 100644 --- a/docs/notebook_source/4-providing-images-as-context.py +++ b/docs/notebook_source/4-providing-images-as-context.py @@ -23,7 +23,7 @@ # - ✨ **Visual Document Processing**: Converting images to chat-ready format for model consumption # - 🔍 **Vision-Language Generation**: Using vision models to generate detailed summaries from images # -# If this is your first time using Data Designer, we recommend starting with the [first notebook](/notebooks/1-the-basics/) in this tutorial series. +# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. # # %% [markdown] @@ -62,7 +62,7 @@ # # - `DataDesigner` is the main object is responsible for managing the data generation process. # -# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) are used. +# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. # # %% @@ -75,7 +75,7 @@ # # - The "model alias" is used to reference the model in the Data Designer config (as we will see below). # -# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/concepts/models/default-model-settings/) docs for more details).
+# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details). # # - By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider. # diff --git a/docs/scripts/generate_colab_notebooks.py b/docs/scripts/generate_colab_notebooks.py index cb375776..446b0807 100644 --- a/docs/scripts/generate_colab_notebooks.py +++ b/docs/scripts/generate_colab_notebooks.py @@ -28,10 +28,8 @@ } COLAB_INSTALL_CELL = """\ -!pip install -qU data-designer""" - -COLAB_DEPENDENCIES_CELL = """\ -!pip install -q {deps}""" +%%capture +!pip install -U data-designer""" COLAB_API_KEY_CELL = """\ import getpass @@ -49,9 +47,12 @@ def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode] """Create the Colab-specific setup cells to inject before imports.""" cells = [] cells += [new_markdown_cell(source=COLAB_SETUP_MARKDOWN)] - cells += [new_code_cell(source=COLAB_INSTALL_CELL)] + + install_cell = COLAB_INSTALL_CELL if additional_dependencies: - cells += [new_code_cell(source=COLAB_DEPENDENCIES_CELL.format(deps=additional_dependencies))] + install_cell += f" {additional_dependencies}" + cells += [new_code_cell(source=install_cell)] + cells += [new_code_cell(source=COLAB_API_KEY_CELL)] return cells