Skip to content

Commit 7fefd0e

Browse files
Use strings for GPU config (#1066)
Co-authored-by: Charles Frye <charles@modal.com>
1 parent 2e59e1c commit 7fefd0e

File tree

16 files changed

+17
-18
lines changed

16 files changed

+17
-18
lines changed

06_gpu_and_ml/dreambooth/diffusers_lora_finetune.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,7 @@ class TrainConfig(SharedConfig):
265265

266266
@app.function(
267267
image=image,
268-
gpu=modal.gpu.A100( # fine-tuning is VRAM-heavy and requires a high-VRAM GPU
269-
count=1, size="80GB"
270-
),
268+
gpu="A100-80GB", # fine-tuning is VRAM-heavy and requires a high-VRAM GPU
271269
volumes={MODEL_DIR: volume}, # stores fine-tuned model
272270
timeout=1800, # 30 minutes
273271
secrets=[huggingface_secret]

06_gpu_and_ml/embeddings/text_embeddings_inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import modal
1616

17-
GPU_CONFIG = modal.gpu.A10G()
17+
GPU_CONFIG = "A10G"
1818
MODEL_ID = "BAAI/bge-base-en-v1.5"
1919
BATCH_SIZE = 32
2020
DOCKER_IMAGE = (

06_gpu_and_ml/embeddings/wikipedia/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
# We first set our configuration variables for our script.
88
## Embedding Containers Configuration
99
GPU_CONCURRENCY = 100
10-
GPU_CONFIG = modal.gpu.A10G()
10+
GPU_CONFIG = "A10G"
1111
MODEL_ID = "BAAI/bge-small-en-v1.5"
1212
MODEL_SLUG = MODEL_ID.split("/")[-1]
1313
BATCH_SIZE = 512

06_gpu_and_ml/llm-serving/chat_with_pdf_vision.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def download_model():
158158

159159
@app.cls(
160160
image=model_image,
161-
gpu=modal.gpu.A100(size="80GB"),
161+
gpu="A100-80GB",
162162
container_idle_timeout=10 * MINUTES, # spin down when inactive
163163
volumes={"/vol/pdfs/": pdf_volume, CACHE_DIR: cache_volume},
164164
)

06_gpu_and_ml/llm-serving/trtllm_llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def download_model():
154154
# about two quadrillion per second on an H100 SXM.
155155

156156
N_GPUS = 1 # Heads up: this example has not yet been tested with multiple GPUs
157-
GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
157+
GPU_CONFIG = f"H100:{N_GPUS}"
158158

159159
DTYPE = "float16" # format we download in, regular fp16
160160
QFORMAT = "fp8" # format we quantize the weights to

06_gpu_and_ml/llm-serving/vllm_inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989

9090
@app.function(
9191
image=vllm_image,
92-
gpu=modal.gpu.H100(count=N_GPU),
92+
gpu=f"H100:{N_GPU}",
9393
container_idle_timeout=5 * MINUTES,
9494
timeout=24 * HOURS,
9595
allow_concurrent_inputs=1000,

06_gpu_and_ml/llm-structured/jsonformer_generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def download_model():
5858
# The generate function takes two arguments `prompt` and `json_schema`, where
5959
# `prompt` is used to describe the domain of your data (for example, "plants")
6060
# and the schema contains the JSON schema you want to populate.
61-
@app.function(gpu=modal.gpu.A10G(), image=image)
61+
@app.function(gpu="A10G", image=image)
6262
def generate(prompt: str, json_schema: dict[str, Any]) -> dict[str, Any]:
6363
from jsonformer import Jsonformer
6464
from transformers import AutoModelForCausalLM, AutoTokenizer

06_gpu_and_ml/llm-structured/outlines_generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def import_model(model_name):
106106
# We specify that we want to use the Mistral-7B model, and then ask for a character, and we'll receive structured data with the right schema.
107107

108108

109-
@app.function(image=outlines_image, gpu=modal.gpu.A100(size="40GB"))
109+
@app.function(image=outlines_image, gpu="A100-40GB")
110110
def generate(
111111
prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
112112
):

06_gpu_and_ml/stable_diffusion/image_to_image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def download_models():
8989
snapshot_download("stabilityai/sdxl-turbo", ignore_patterns=ignore)
9090

9191

92-
@app.cls(gpu=modal.gpu.A10G(), container_idle_timeout=240)
92+
@app.cls(gpu="A10G", container_idle_timeout=240)
9393
class Model:
9494
@modal.enter()
9595
def enter(self):

06_gpu_and_ml/text-to-video/mochi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def download_model(revision="83359d26a7e2bbe200ecbfda8ebff850fd03b545"):
133133
OUTPUTS_PATH: outputs, # videos will be saved to a distributed volume
134134
MODEL_PATH: model,
135135
},
136-
gpu=modal.gpu.H100(count=1),
136+
gpu="H100",
137137
timeout=1 * HOURS,
138138
)
139139
class Mochi:

0 commit comments

Comments (0)