
Commit 5aabe4c (parent: 88908c1)

[Tutorials] Lazy import GPU modules in the Llama Nemotron tutorial (#831)

Signed-off-by: Mehran Maghoumi <Maghoumi@users.noreply.github.com>
Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com>

File tree: 2 files changed, +18 −3 lines


tutorials/llama-nemotron-data-curation/README.md (3 additions, 1 deletion)

```diff
@@ -39,7 +39,9 @@ This tutorial demonstrates how a user can process a subset of the Llama Nemotron dataset
 
 Setup requirements:
 
-- Hardware: CPU is sufficient, GPU is recommended for enhanced performance
+- Hardware:
+  - This tutorial can be run entirely on a CPU with 4 workers and 64 GB of RAM.
+  - This tutorial can also be run on a single H100 GPU.
 - Recommended environment: This tutorial was developed and tested with a Conda environment
 
 Please refer to NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) for instructions on how to download NeMo Curator via PyPI, source, or Docker.
```

tutorials/llama-nemotron-data-curation/main.py (15 additions, 2 deletions)

```diff
@@ -12,19 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import argparse
 import os
 import time
 from itertools import zip_longest
+from typing import TYPE_CHECKING
 
-import cudf
 import dask.dataframe as dd
-import dask_cudf
 import fasttext
 import pandas as pd
 from dask.delayed import delayed
 from transformers import AutoTokenizer
 
+if TYPE_CHECKING:
+    import cudf
+    import dask_cudf
+
 from nemo_curator import ScoreFilter, Sequential
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.filters import DocumentFilter
```
from nemo_curator.filters import DocumentFilter
```diff
@@ -366,6 +371,8 @@ def interleave_partitions(
         merged_parts.append(p2)
 
     if gpu:
+        import dask_cudf
+
         return dask_cudf.from_delayed(merged_parts, meta=df1._meta)  # noqa: SLF001
     else:
         return dd.from_delayed(merged_parts, meta=df1._meta)  # noqa: SLF001
```
```diff
@@ -386,6 +393,8 @@ def _interleave_rows(
         rows.append(df2.iloc[i])
 
     if gpu:
+        import cudf
+
         return cudf.DataFrame(rows)
     else:
         return pd.DataFrame(rows)
```
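The row-interleaving logic this hunk touches can be exercised on the CPU path alone. A simplified pandas-only sketch using the same `zip_longest` idea (`interleave_rows_cpu` is a hypothetical name, not the tutorial's function):

```python
from itertools import zip_longest

import pandas as pd


def interleave_rows_cpu(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Alternate rows from two frames; the longer frame's tail is appended."""
    rows = []
    for r1, r2 in zip_longest(df1.to_dict("records"), df2.to_dict("records")):
        if r1 is not None:
            rows.append(r1)
        if r2 is not None:
            rows.append(r2)
    return pd.DataFrame(rows)


a = pd.DataFrame({"v": [1, 3]})
b = pd.DataFrame({"v": [2, 4, 5]})
print(interleave_rows_cpu(a, b)["v"].tolist())  # [1, 2, 3, 4, 5]
```

On the GPU path the commit builds the same row list but hands it to `cudf.DataFrame`, which is why the import can wait until that branch runs.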
```diff
@@ -408,6 +417,8 @@ def interleave_rows(
         interleaved_parts.append(interleaved)
 
     if gpu:
+        import dask_cudf
+
         return dask_cudf.from_delayed(interleaved_parts, meta=df1._meta)  # noqa: SLF001
     else:
         return dd.from_delayed(interleaved_parts, meta=df1._meta)  # noqa: SLF001
```
```diff
@@ -505,6 +516,8 @@ def main(args: argparse.Namespace) -> None:  # noqa: C901, PLR0915
 
     # Convert to GPU if requested
     if args.device == "gpu":
+        import cudf
+
         print("Converting to GPU")
         dataset_df = dataset_df.map_partitions(lambda partition: cudf.from_pandas(partition))
```
