 import importlib.util
 import logging
+import os

+from diffusers import StableDiffusionPipeline  # used below to read the repository config
 from transformers.utils.import_utils import is_torch_bf16_gpu_available
+from optimum import neuron

 logger = logging.getLogger(__name__)
 logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
@@ -49,14 +51,64 @@ def __call__(
         out = self.pipeline(prompt, num_images_per_prompt=1, **kwargs)
         return out.images[0]

-
-DIFFUSERS_TASKS = {
-    "text-to-image": IEAutoPipelineForText2Image,
-}
-
-
-def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **kwargs):
+
+def load_optimum_diffusion_pipeline(task, model_dir):
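+    """Select and load a Neuron diffusion pipeline for `task` from `model_dir`.
+
+    The pipeline class is inferred from the `_class_name` entry of the repository
+    config: an already Neuron-compiled checkpoint is loaded as-is, anything else
+    is exported (compiled) on the fly.
+    """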
+    # Step 1: load the repository config and look up its `_class_name`
+    try:
+        config = StableDiffusionPipeline.load_config(pretrained_model_name_or_path=model_dir)
+    except OSError:
+        logger.exception("Unable to load config file for repository %s", model_dir)
+        raise
+
+    pipeline_class_name = config["_class_name"]
+
+    logger.debug("Repository pipeline class name %s", pipeline_class_name)
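+    # Step 2: pick the Neuron pipeline class matching the checkpoint family
+    # (SDXL vs. plain Stable Diffusion) and the requested task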
+    if "Diffusion" in pipeline_class_name and "XL" in pipeline_class_name:
+        if task == "image-to-image":
+            pipeline_class = neuron.NeuronStableDiffusionXLImg2ImgPipeline
+        else:
+            pipeline_class = neuron.NeuronStableDiffusionXLPipeline
+    else:
+        if task == "image-to-image":
+            pipeline_class = neuron.NeuronStableDiffusionImg2ImgPipeline
+        else:
+            pipeline_class = neuron.NeuronStableDiffusionPipeline
+
+    logger.debug("Pipeline class %s", pipeline_class.__name__)
+
+    # Step 3: build the loading kwargs. An already Neuron-compiled checkpoint
+    # needs none.
+    if "Neuron" in pipeline_class_name:
+        kwargs = {}
+    else:
+        # The model is compiled and exported on the fly: cached artifacts cause a
+        # performance drop for diffusion models, so weights are inlined into the
+        # NEFF unless an explicit env variable opts out
+        compiler_args = {
+            "auto_cast": "matmul",
+            "auto_cast_type": "bf16",
+            "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF", "true").lower()
+            not in ["false", "no", "0"],
+            "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet"),
+        }
+        # Image shapes have to be frozen at load/compilation time
+        input_shapes = {
+            "batch_size": 1,
+            "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
+            "width": int(os.environ.get("IMAGE_WIDTH", 512)),
+        }
+        kwargs = {**compiler_args, **input_shapes, "export": True}
+
+    # In the export case, compilation can take a long time, which makes endpoints
+    # a poor fit as long as the compilation cache is not a real option for diffusion
+    return pipeline_class.from_pretrained(model_dir, **kwargs)
+
+
+def get_diffusers_pipeline(task=None, model_dir=None, **kwargs):
     """Get a pipeline for Diffusers models."""
-    device = "cuda" if device == 0 else "cpu"
-    pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
+    pipeline = load_optimum_diffusion_pipeline(task=task, model_dir=model_dir)
     return pipeline