@@ -65,11 +65,15 @@ def ensure_models(self, model_type, device: torch.device, boost: bool):
     def load_models(self, model_type, device: torch.device, boost: bool):
         """Ensure that the depth model is loaded"""
 
+        # TODO: we need to at least try to find models downloaded by other plugins (e.g. controlnet)
+
         # model path and name
         # ZoeDepth and Marigold do not use this
         model_dir = "./models/midas"
         if model_type == 0:
             model_dir = "./models/leres"
+        if model_type == 11:
+            model_dir = "./models/depth_anything"
 
         # create paths to model if not present
         os.makedirs(model_dir, exist_ok=True)
@@ -202,14 +206,31 @@ def load_models(self, model_type, device: torch.device, boost: bool):
             except:
                 pass  # run without xformers
 
+        elif model_type == 11:  # depth_anything
+            from depth_anything.dpt import DPT_DINOv2
+            # Note: with localhub=False this downloads the DINOv2 backbone via torch.hub into its default cache
+            model = (
+                DPT_DINOv2(
+                    encoder="vitl",
+                    features=256,
+                    out_channels=[256, 512, 1024, 1024],
+                    localhub=False,
+                ).to(device).eval()
+            )
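+            # ViT-L/14 checkpoint, fetched from the LiheYoung/Depth-Anything HF space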
+            model_path = f"{model_dir}/depth_anything_vitl14.pth"
+            ensure_file_downloaded(model_path,
+                                   "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth")
+
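+            # weights stay fp32: the half() conversion below still skips model_type 11 (see the TODO there)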
+            model.load_state_dict(torch.load(model_path))
+
         if model_type in range(0, 10):
             model.eval()  # prepare for evaluation
         # optimize
         if device == torch.device("cuda"):
             if model_type in [0, 1, 2, 3, 4, 5, 6]:
                 model = model.to(memory_format=torch.channels_last)  # TODO: weird
             if not self.no_half:
-                if model_type in [1, 2, 3, 4, 5, 6] and not boost:  # TODO: zoedepth, too?
+                if model_type in [1, 2, 3, 4, 5, 6] and not boost:  # TODO: zoedepth, Marigold and depth_anything, too?
                     model = model.half()
         model.to(device)  # to correct device
 
@@ -250,7 +271,8 @@ def get_default_net_size(model_type):
             7: [384, 512],
             8: [384, 768],
             9: [384, 512],
-            10: [768, 768]
+            10: [768, 768],
+            11: [518, 518]
         }
         if model_type in sizes:
             return sizes[model_type]
@@ -307,6 +329,8 @@ def get_raw_prediction(self, input, net_width, net_height):
         elif self.depth_model_type == 10:
             raw_prediction = estimatemarigold(img, self.depth_model, net_width, net_height,
                                               self.marigold_ensembles, self.marigold_steps)
+        elif self.depth_model_type == 11:
+            raw_prediction = estimatedepthanything(img, self.depth_model, net_width, net_height)
         else:
             raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model,
                                            self.boost_rmax)
@@ -414,6 +438,7 @@ def estimatemidas(img, model, w, h, resize_mode, normalization, no_half, precisi
 # TODO: "h" is not used
 def estimatemarigold(image, model, w, h, marigold_ensembles=5, marigold_steps=12):
     # This hideous thing should be re-implemented once there is support from the upstream.
+    # TODO: re-implement this hideous thing by using features from the upstream
     img = cv2.cvtColor((image * 255.0001).astype('uint8'), cv2.COLOR_BGR2RGB)
     img = Image.fromarray(img)
     with torch.no_grad():
@@ -423,6 +448,37 @@ def estimatemarigold(image, model, w, h, marigold_ensembles=5, marigold_steps=12
     return cv2.resize(pipe_out.depth_np, (image.shape[:2][::-1]), interpolation=cv2.INTER_CUBIC)
 
 
+def estimatedepthanything(image, model, w, h):
+    from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+    transform = Compose(
+        [
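+            # DINOv2 ViT-L/14 only accepts sides that are multiples of its 14px patch size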
+            Resize(
+                width=w // 14 * 14,
+                height=h // 14 * 14,
+                resize_target=False,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=14,
+                resize_method="lower_bound",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
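+            # standard ImageNet mean/std normalization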
+            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+            PrepareForNet(),
+        ]
+    )
+
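+    # PrepareForNet yields a CHW float32 array; add a batch dim and move it to the model's device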
+    timage = transform({"image": image})["image"]
+    timage = torch.from_numpy(timage).unsqueeze(0).to(next(model.parameters()).device)
+
+    with torch.no_grad():
+        depth = model(timage)
+        import torch.nn.functional as F
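+        # the prediction comes out at the network's working resolution; interpolate back to the input size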
+        depth = F.interpolate(
+            depth[None], (image.shape[0], image.shape[1]), mode="bilinear", align_corners=False
+        )[0, 0]
+
+    return depth.cpu().numpy()
+
+
 class ImageandPatchs:
     def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1):
         self.root_dir = root_dir
@@ -640,13 +696,14 @@ def estimateboost(img, model, model_type, pix2pixmodel, whole_size_threshold):
 
     if model_type == 0:  # leres
         net_receptive_field_size = 448
-        patch_netsize = 2 * net_receptive_field_size
     elif model_type == 1:  # dpt_beit_large_512
         net_receptive_field_size = 512
-        patch_netsize = 2 * net_receptive_field_size
+    elif model_type == 11:  # depth_anything
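+        # 518 = 37 * 14, the resolution Depth Anything was trained at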
+        net_receptive_field_size = 518
     else:  # other midas  # TODO Marigold support
         net_receptive_field_size = 384
-        patch_netsize = 2 * net_receptive_field_size
+    patch_netsize = 2 * net_receptive_field_size
+    # Good luck trying to use zoedepth
 
     gc.collect()
     backbone.torch_gc()
@@ -916,6 +973,8 @@ def singleestimate(img, msize, model, net_type):
         return estimateleres(img, model, msize, msize)
     elif net_type == 10:
         return estimatemarigold(img, model, msize, msize)
+    elif net_type == 11:
+        return estimatedepthanything(img, model, msize, msize)
     elif net_type >= 7:
         # np to PIL
         return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize)