@@ -65,6 +65,12 @@ def forward(self, x):
6565 return depth .squeeze (1 ).unflatten (0 , (B , T )) # return shape [B, T, H, W]
6666
6767 def infer_video_depth (self , frames , target_fps , input_size = 518 , device = 'cuda' ):
68+ frame_height , frame_width = frames [0 ].shape [:2 ]
69+ ratio = max (frame_height , frame_width ) / min (frame_height , frame_width )
70+ if ratio > 1.78 : # we recommend processing videos with an aspect ratio no wider than 16:9 (~1.78) due to memory limitations
71+ input_size = int (input_size * 1.777 / ratio )
72+ input_size = round (input_size / 14 ) * 14
73+
6874 transform = Compose ([
6975 Resize (
7076 width = input_size ,
@@ -79,7 +85,6 @@ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda'):
7985 PrepareForNet (),
8086 ])
8187
82- frame_size = frames [0 ].shape [:2 ]
8388 frame_list = [frames [i ] for i in range (frames .shape [0 ])]
8489 frame_step = INFER_LEN - OVERLAP
8590 org_video_len = len (frame_list )
@@ -99,7 +104,7 @@ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda'):
99104 with torch .no_grad ():
100105 depth = self .forward (cur_input ) # depth shape: [1, T, H, W]
101106
102- depth = F .interpolate (depth .flatten (0 ,1 ).unsqueeze (1 ), size = frame_size , mode = 'bilinear' , align_corners = True )
107+ depth = F .interpolate (depth .flatten (0 ,1 ).unsqueeze (1 ), size = ( frame_height , frame_width ) , mode = 'bilinear' , align_corners = True )
103108 depth_list += [depth [i ][0 ].cpu ().numpy () for i in range (depth .shape [0 ])]
104109
105110 pre_input = cur_input
0 commit comments