Skip to content

Commit 87569b4

Browse files
authored
enh(model_loaders): Add GPU support. (#75)
Set n_gpu_layers based on torch.cuda.is_available
1 parent 09faf73 commit 87569b4

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

src/document_to_podcast/inference/model_loaders.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Tuple
22

3+
import torch
34
from huggingface_hub import hf_hub_download
45
from llama_cpp import Llama
56
from outetts import GGUFModelConfig_v1, InterfaceGGUF
@@ -30,13 +31,12 @@ def load_llama_cpp_model(
3031
# 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
3132
n_ctx=0,
3233
verbose=False,
34+
n_gpu_layers=-1 if torch.cuda.is_available() else 0,
3335
)
3436
return model
3537

3638

37-
def load_outetts_model(
38-
model_id: str, language: str = "en", device: str = "cpu"
39-
) -> InterfaceGGUF:
39+
def load_outetts_model(model_id: str, language: str = "en") -> InterfaceGGUF:
4040
"""
4141
Loads the given model_id using the OuteTTS interface. For more info: https://github.com/edwko/OuteTTS
4242
@@ -47,43 +47,43 @@ def load_outetts_model(
4747
model_id (str): The model id to load.
4848
Format is expected to be `{org}/{repo}/{filename}`.
4949
language (str): Supported languages in 0.2-500M: en, zh, ja, ko.
50-
device (str): The device to load the model on, such as "cuda:0" or "cpu".
5150
5251
Returns:
5352
PreTrainedModel: The loaded model.
5453
"""
55-
n_layers_on_gpu = 0 if device == "cpu" else -1
5654
model_version = model_id.split("-")[1]
5755

5856
org, repo, filename = model_id.split("/")
5957
local_path = hf_hub_download(repo_id=f"{org}/{repo}", filename=filename)
6058
model_config = GGUFModelConfig_v1(
61-
model_path=local_path, language=language, n_gpu_layers=n_layers_on_gpu
59+
model_path=local_path,
60+
language=language,
61+
n_gpu_layers=-1 if torch.cuda.is_available() else 0,
62+
additional_model_config={"verbose": False},
6263
)
6364

6465
return InterfaceGGUF(model_version=model_version, cfg=model_config)
6566

6667

6768
def load_parler_tts_model_and_tokenizer(
68-
model_id: str, device: str = "cpu"
69+
model_id: str,
6970
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
7071
"""
7172
Loads the given model_id using parler_tts.from_pretrained. For more info: https://github.com/huggingface/parler-tts
7273
7374
Examples:
74-
>>> model, tokenizer = load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
75+
>>> model, tokenizer = load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1")
7576
7677
Args:
7778
model_id (str): The model id to load.
7879
Format is expected to be `{repo}/{filename}`.
79-
device (str): The device to load the model on, such as "cuda:0" or "cpu".
8080
8181
Returns:
8282
PreTrainedModel: The loaded model.
8383
"""
8484
from parler_tts import ParlerTTSForConditionalGeneration
8585

86-
model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
86+
model = ParlerTTSForConditionalGeneration.from_pretrained(model_id)
8787
tokenizer = AutoTokenizer.from_pretrained(model_id)
8888

8989
return model, tokenizer

0 commit comments

Comments
 (0)