Commit 8f5e2cb

blessedcoolant authored and hipsterusername committed
feat: Add Depth Anything PreProcessor
1 parent 2aed6e2 commit 8f5e2cb
File tree

5 files changed: +695 −0 lines changed
invokeai/app/invocations/controlnet_image_processors.py

Lines changed: 30 additions & 0 deletions
@@ -30,6 +30,7 @@
 from invokeai.app.invocations.util import validate_begin_end_step, validate_weights
 from invokeai.app.services.image_records.image_records_common import ImageCategory, ResourceOrigin
 from invokeai.app.shared.fields import FieldDescriptions
+from invokeai.backend.image_util.depth_anything import DepthAnythingDetector
 
 from ...backend.model_management import BaseModelType
 from .baseinvocation import (
@@ -602,3 +603,32 @@ def run_processor(self, image: Image.Image):
         color_map = cv2.resize(color_map, (width, height), interpolation=cv2.INTER_NEAREST)
         color_map = Image.fromarray(color_map)
         return color_map
+
+
+DEPTH_ANYTHING_MODEL_SIZES = Literal["large", "base", "small"]
+
+
+@invocation(
+    "depth_anything_image_processor",
+    title="Depth Anything Processor",
+    tags=["controlnet", "depth", "depth anything"],
+    category="controlnet",
+    version="1.0.0",
+)
+class DepthAnythingImageProcessorInvocation(ImageProcessorInvocation):
+    """Generates a depth map based on the Depth Anything algorithm"""
+
+    model_size: DEPTH_ANYTHING_MODEL_SIZES = InputField(
+        default="large", description="The size of the depth model to use"
+    )
+    offload: bool = InputField(default=False)
+
+    def run_processor(self, image):
+        depth_anything_detector = DepthAnythingDetector()
+        depth_anything_detector.load_model(model_size=self.model_size)
+
+        if image.mode == "RGBA":
+            image = image.convert("RGB")
+
+        processed_image = depth_anything_detector(image=image, offload=self.offload)
+        return processed_image
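
For orientation, here is a minimal sketch (not part of the diff) of how the new processor logic could be exercised outside of an InvokeAI graph; the input/output paths and the "small" model choice are illustrative assumptions, and the calls mirror what run_processor does above:

# Illustrative only: mirrors DepthAnythingImageProcessorInvocation.run_processor.
from PIL import Image

from invokeai.backend.image_util.depth_anything import DepthAnythingDetector

image = Image.open("input.png")  # hypothetical test image

detector = DepthAnythingDetector()
detector.load_model(model_size="small")  # downloads the checkpoint on first use

if image.mode == "RGBA":
    image = image.convert("RGB")  # same RGBA guard as run_processor

depth_map = detector(image=image, offload=True)  # offload frees the model afterwards
depth_map.save("depth_map.png")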
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
+import pathlib
+from typing import Literal, Union
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from einops import repeat
+from PIL import Image
+from torchvision.transforms import Compose
+
+from invokeai.app.services.config.config_default import InvokeAIAppConfig
+from invokeai.backend.image_util.depth_anything.model.dpt import DPT_DINOv2
+from invokeai.backend.image_util.depth_anything.utilities.util import NormalizeImage, PrepareForNet, Resize
+from invokeai.backend.util.devices import choose_torch_device
+from invokeai.backend.util.util import download_with_progress_bar
+
+config = InvokeAIAppConfig.get_config()
+
+DEPTH_ANYTHING_MODELS = {
+    "large": {
+        "url": "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth?download=true",
+        "local": "sd-1/controlnet/annotator/depth_anything/depth_anything_vitl14.pth",
+    },
+    "base": {
+        "url": "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitb14.pth?download=true",
+        "local": "sd-1/controlnet/annotator/depth_anything/depth_anything_vitb14.pth",
+    },
+    "small": {
+        "url": "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vits14.pth?download=true",
+        "local": "sd-1/controlnet/annotator/depth_anything/depth_anything_vits14.pth",
+    },
+}
+
+
+transform = Compose(
+    [
+        Resize(
+            width=518,
+            height=518,
+            resize_target=False,
+            keep_aspect_ratio=True,
+            ensure_multiple_of=14,
+            resize_method="lower_bound",
+            image_interpolation_method=cv2.INTER_CUBIC,
+        ),
+        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        PrepareForNet(),
+    ]
+)
+
+
+class DepthAnythingDetector:
+    def __init__(self) -> None:
+        self.model = None
+        self.model_size: Union[Literal["large", "base", "small"], None] = None
+
+    def load_model(self, model_size: Literal["large", "base", "small"]):
+        DEPTH_ANYTHING_MODEL_PATH = pathlib.Path(config.models_path / DEPTH_ANYTHING_MODELS[model_size]["local"])
+        if not DEPTH_ANYTHING_MODEL_PATH.exists():
+            download_with_progress_bar(DEPTH_ANYTHING_MODELS[model_size]["url"], DEPTH_ANYTHING_MODEL_PATH)
+
+        if not self.model or model_size != self.model_size:
+            del self.model
+            self.model_size = model_size
+
+            if self.model_size == "small":
+                self.model = DPT_DINOv2(encoder="vits", features=64, out_channels=[48, 96, 192, 384], localhub=True)
+            if self.model_size == "base":
+                self.model = DPT_DINOv2(encoder="vitb", features=128, out_channels=[96, 192, 384, 768], localhub=True)
+            if self.model_size == "large":
+                self.model = DPT_DINOv2(
+                    encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024], localhub=True
+                )
+
+            self.model.load_state_dict(torch.load(DEPTH_ANYTHING_MODEL_PATH.as_posix(), map_location="cpu"))
+            self.model.eval()
+
+        self.model.to(choose_torch_device())
+        return self.model
+
+    def to(self, device):
+        self.model.to(device)
+        return self
+
+    def __call__(self, image, offload=False):
+        image = np.array(image, dtype=np.uint8)
+        original_width, original_height = image.shape[:2]
+        image = image[:, :, ::-1] / 255.0
+
+        image_width, image_height = image.shape[:2]
+        image = transform({"image": image})["image"]
+        image = torch.from_numpy(image).unsqueeze(0).to(choose_torch_device())
+
+        with torch.no_grad():
+            depth = self.model(image)
+            depth = F.interpolate(depth[None], (image_height, image_width), mode="bilinear", align_corners=False)[0, 0]
+            depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+
+        depth_map = repeat(depth, "h w -> h w 3").cpu().numpy().astype(np.uint8)
+        depth_map = Image.fromarray(depth_map)
+        depth_map = depth_map.resize((original_height, original_width))
+
+        if offload:
+            del self.model
+
+        return depth_map
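
The tail of __call__ min-max normalizes the raw depth prediction onto the 0–255 range and replicates the single channel to three before building a PIL image. A small self-contained illustration of that step using a dummy tensor (not part of the diff, values are arbitrary):

import numpy as np
import torch
from einops import repeat
from PIL import Image

# Dummy "raw depth" values standing in for the model output.
depth = torch.tensor([[0.2, 1.5], [3.0, 7.5]])

# Linear min-max rescale onto 0..255, as in DepthAnythingDetector.__call__.
depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0

# Replicate the single channel to H x W x 3 and convert to an 8-bit RGB image.
depth_map = repeat(depth, "h w -> h w 3").cpu().numpy().astype(np.uint8)
image = Image.fromarray(depth_map)
print(image.size, image.mode)  # (2, 2) RGB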
Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
+import torch.nn as nn
+
+
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+    scratch = nn.Module()
+
+    out_shape1 = out_shape
+    out_shape2 = out_shape
+    out_shape3 = out_shape
+    if len(in_shape) >= 4:
+        out_shape4 = out_shape
+
+    if expand:
+        out_shape1 = out_shape
+        out_shape2 = out_shape * 2
+        out_shape3 = out_shape * 4
+        if len(in_shape) >= 4:
+            out_shape4 = out_shape * 8
+
+    scratch.layer1_rn = nn.Conv2d(
+        in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer2_rn = nn.Conv2d(
+        in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer3_rn = nn.Conv2d(
+        in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    if len(in_shape) >= 4:
+        scratch.layer4_rn = nn.Conv2d(
+            in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+        )
+
+    return scratch
+
+
+class ResidualConvUnit(nn.Module):
+    """Residual convolution module."""
+
+    def __init__(self, features, activation, bn):
+        """Init.
+
+        Args:
+            features (int): number of features
+        """
+        super().__init__()
+
+        self.bn = bn
+
+        self.groups = 1
+
+        self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+        self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+        if self.bn:
+            self.bn1 = nn.BatchNorm2d(features)
+            self.bn2 = nn.BatchNorm2d(features)
+
+        self.activation = activation
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, x):
+        """Forward pass.
+
+        Args:
+            x (tensor): input
+
+        Returns:
+            tensor: output
+        """
+
+        out = self.activation(x)
+        out = self.conv1(out)
+        if self.bn:
+            out = self.bn1(out)
+
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.bn:
+            out = self.bn2(out)
+
+        if self.groups > 1:
+            out = self.conv_merge(out)
+
+        return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock(nn.Module):
+    """Feature fusion block."""
+
+    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None):
+        """Init.
+
+        Args:
+            features (int): number of features
+        """
+        super(FeatureFusionBlock, self).__init__()
+
+        self.deconv = deconv
+        self.align_corners = align_corners
+
+        self.groups = 1
+
+        self.expand = expand
+        out_features = features
+        if self.expand:
+            out_features = features // 2
+
+        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+        self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
+        self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+        self.size = size
+
+    def forward(self, *xs, size=None):
+        """Forward pass.
+
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+
+        if len(xs) == 2:
+            res = self.resConfUnit1(xs[1])
+            output = self.skip_add.add(output, res)
+
+        output = self.resConfUnit2(output)
+
+        if (size is None) and (self.size is None):
+            modifier = {"scale_factor": 2}
+        elif size is None:
+            modifier = {"size": self.size}
+        else:
+            modifier = {"size": size}
+
+        output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
+
+        output = self.out_conv(output)
+
+        return output
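
As a quick sanity check of the decoder blocks above, a sketch (not part of the diff) that runs FeatureFusionBlock on dummy tensors: it fuses an optional lateral feature into the input, refines it with two ResidualConvUnits, and upsamples by 2x before the 1x1 output projection. The shapes and activation choice below are illustrative, and the class is assumed to be importable from this module:

import torch
import torch.nn as nn

block = FeatureFusionBlock(features=64, activation=nn.ReLU(), bn=False)

x = torch.randn(1, 64, 16, 16)     # coarser decoder feature
skip = torch.randn(1, 64, 16, 16)  # lateral feature from a scratch.layerN_rn conv

out = block(x, skip)
print(out.shape)  # torch.Size([1, 64, 32, 32]) -- fused, refined, upsampled 2x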

0 commit comments
