Skip to content

Commit 46b93ec

Browse files
authored
Merge pull request #67 from guohengkai/ghk/dev/streaming
Support streaming mode (experimental feature)
2 parents 6b16e1c + ee9750a commit 46b93ec

File tree

6 files changed

+350
-43
lines changed

6 files changed

+350
-43
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ This work presents **Video Depth Anything** based on [Depth Anything V2](https:/
2121
![teaser](assets/teaser_video_v2.png)
2222

2323
## News
24+
- **2025-07-03:** 🚀🚀🚀 Release an experimental version of training-free **streaming video depth estimation**.
2425
- **2025-07-03:** Release our implementation of [training loss](https://github.com/DepthAnything/Video-Depth-Anything/tree/main/loss).
2526
- **2025-04-25:** 🌟🌟🌟 Release [metric depth model](https://github.com/DepthAnything/Video-Depth-Anything/tree/main/metric_depth) based on Video-Depth-Anything-Large.
2627
- **2025-04-05:** Our paper has been accepted for a **highlight** presentation at [CVPR 2025](https://cvpr.thecvf.com/) (13.5% of the accepted papers).
@@ -107,6 +108,24 @@ Options:
107108
- `--save_npz` (optional): Save the depth map in `npz` format.
108109
- `--save_exr` (optional): Save the depth map in `exr` format.
109110

111+
### Inference a video using streaming mode (experimental feature)
112+
We implement an experimental streaming mode **without training**. In detail, we cache the hidden states of the temporal attentions for each frame, and feed only a single frame into our video depth model during inference, reusing these past hidden states in the temporal attentions. We hack our pipeline to align with the original inference setting of the offline mode. Due to the inevitable gap between training and testing, we observe a **performance drop** between the streaming model and the offline model (e.g. the `d1` of ScanNet drops from `0.926` to `0.836`). Finetuning the model in the streaming mode would greatly improve the performance; we leave that for future work.
113+
114+
To run the streaming model:
115+
```bash
116+
python3 run_streaming.py --input_video ./assets/example_videos/davis_rollercoaster.mp4 --output_dir ./outputs_streaming --encoder vitl
117+
```
118+
Options:
119+
- `--input_video`: path of input video
120+
- `--output_dir`: path to save the output results
121+
- `--input_size` (optional): By default, we use input size `518` for model inference.
122+
- `--max_res` (optional): By default, we use maximum resolution `1280` for model inference.
123+
- `--encoder` (optional): `vits` for Video-Depth-Anything-V2-Small, `vitl` for Video-Depth-Anything-V2-Large.
124+
- `--max_len` (optional): maximum length of the input video, `-1` means no limit
125+
- `--target_fps` (optional): target fps of the input video, `-1` means the original fps
126+
- `--fp32` (optional): Use `fp32` precision for inference. By default, we use `fp16`.
127+
- `--grayscale` (optional): Save the grayscale depth map, without applying color palette.
128+
110129
### Training Loss
111130
Our training loss is in `loss/` directory. Please see the `loss/test_loss.py` for usage.
112131

run_streaming.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
# Copyright (2025) Bytedance Ltd. and/or its affiliates

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Streaming video depth estimation (experimental).

Reads the input video frame by frame and runs single-frame inference via
``infer_video_depth_one``, which reuses cached temporal-attention hidden
states inside the model instead of processing a whole clip at once.
The per-frame depth maps are stacked and saved as a visualization video.
"""
import argparse
import numpy as np
import os
import torch
import time
import cv2

from video_depth_anything.video_depth_stream import VideoDepthAnything
from utils.dc_utils import save_video

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Video Depth Anything')
    parser.add_argument('--input_video', type=str, default='./assets/example_videos/davis_rollercoaster.mp4')
    parser.add_argument('--output_dir', type=str, default='./outputs')
    parser.add_argument('--input_size', type=int, default=518)
    parser.add_argument('--max_res', type=int, default=1280)
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl'])
    parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit')
    parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps')
    parser.add_argument('--fp32', action='store_true', help='model infer with torch.float32, default is torch.float16')
    parser.add_argument('--grayscale', action='store_true', help='do not apply colorful palette')

    args = parser.parse_args()

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    }

    video_depth_anything = VideoDepthAnything(**model_configs[args.encoder])
    video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True)
    video_depth_anything = video_depth_anything.to(DEVICE).eval()

    cap = cv2.VideoCapture(args.input_video)
    # Fail fast with a clear message instead of silently producing zero frames.
    if not cap.isOpened():
        raise IOError(f'Cannot open input video: {args.input_video}')
    original_fps = cap.get(cv2.CAP_PROP_FPS)
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Downscale so the longer side does not exceed --max_res (hoisted out of
    # the read loop; the condition is invariant across frames).
    need_resize = args.max_res > 0 and max(original_height, original_width) > args.max_res
    if need_resize:
        scale = args.max_res / max(original_height, original_width)
        height = round(original_height * scale)
        width = round(original_width * scale)

    fps = original_fps if args.target_fps < 0 else args.target_fps

    # Guard against broken fps metadata (some containers report 0), which
    # previously raised ZeroDivisionError here.
    stride = max(round(original_fps / fps), 1) if fps > 0 else 1

    depths = []
    frame_count = 0
    start = time.time()
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or (args.max_len > 0 and frame_count >= args.max_len):
            break
        if frame_count % stride == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes BGR; the model expects RGB
            if need_resize:
                frame = cv2.resize(frame, (width, height))

            # Single-frame inference; temporal context comes from the model's
            # internal hidden-state caches.
            depth = video_depth_anything.infer_video_depth_one(frame, input_size=args.input_size, device=DEVICE, fp32=args.fp32)
            depths.append(depth)
        frame_count += 1
        if frame_count % 50 == 0:
            print(f"frame: {frame_count}/{total_frames}")
    end = time.time()

    cap.release()
    print(f"time: {end - start}s")

    # np.stack on an empty list raises an opaque error; report the real cause.
    if not depths:
        raise RuntimeError('No frames were processed; check the input video and the --max_len/--target_fps options.')

    video_name = os.path.basename(args.input_video)
    os.makedirs(args.output_dir, exist_ok=True)  # race-free; replaces exists()+makedirs

    depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0] + '_vis.mp4')
    depths = np.stack(depths, axis=0)
    save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale)

video_depth_anything/dpt_temporal.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def __init__(self,
5050
**motion_module_kwargs)
5151
])
5252

53-
def forward(self, out_features, patch_h, patch_w, frame_length, micro_batch_size=4):
53+
def forward(self, out_features, patch_h, patch_w, frame_length, micro_batch_size=4, cached_hidden_state_list=None):
5454
out = []
5555
for i, x in enumerate(out_features):
5656
if self.use_clstoken:
@@ -71,19 +71,27 @@ def forward(self, out_features, patch_h, patch_w, frame_length, micro_batch_size
7171
layer_1, layer_2, layer_3, layer_4 = out
7272

7373
B, T = layer_1.shape[0] // frame_length, frame_length
74+
if cached_hidden_state_list is not None:
75+
N = len(cached_hidden_state_list) // len(self.motion_modules)
76+
else:
77+
N = 0
7478

75-
layer_3 = self.motion_modules[0](layer_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
76-
layer_4 = self.motion_modules[1](layer_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
79+
layer_3, h0 = self.motion_modules[0](layer_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None, cached_hidden_state_list[0:N] if N else None)
80+
layer_3 = layer_3.permute(0, 2, 1, 3, 4).flatten(0, 1)
81+
layer_4, h1 = self.motion_modules[1](layer_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None, cached_hidden_state_list[N:2*N] if N else None)
82+
layer_4 = layer_4.permute(0, 2, 1, 3, 4).flatten(0, 1)
7783

7884
layer_1_rn = self.scratch.layer1_rn(layer_1)
7985
layer_2_rn = self.scratch.layer2_rn(layer_2)
8086
layer_3_rn = self.scratch.layer3_rn(layer_3)
8187
layer_4_rn = self.scratch.layer4_rn(layer_4)
8288

8389
path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
84-
path_4 = self.motion_modules[2](path_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
90+
path_4, h2 = self.motion_modules[2](path_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None, cached_hidden_state_list[2*N:3*N] if N else None)
91+
path_4 = path_4.permute(0, 2, 1, 3, 4).flatten(0, 1)
8592
path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
86-
path_3 = self.motion_modules[3](path_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
93+
path_3, h3 = self.motion_modules[3](path_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None, cached_hidden_state_list[3*N:] if N else None)
94+
path_3 = path_3.permute(0, 2, 1, 3, 4).flatten(0, 1)
8795

8896
batch_size = layer_1_rn.shape[0]
8997
if batch_size <= micro_batch_size or batch_size % micro_batch_size != 0:
@@ -97,7 +105,8 @@ def forward(self, out_features, patch_h, patch_w, frame_length, micro_batch_size
97105
ori_type = out.dtype
98106
with torch.autocast(device_type="cuda", enabled=False):
99107
out = self.scratch.output_conv2(out.float())
100-
return out.to(ori_type)
108+
109+
output = out.to(ori_type)
101110
else:
102111
ret = []
103112
for i in range(0, batch_size, micro_batch_size):
@@ -111,4 +120,6 @@ def forward(self, out_features, patch_h, patch_w, frame_length, micro_batch_size
111120
with torch.autocast(device_type="cuda", enabled=False):
112121
out = self.scratch.output_conv2(out.float())
113122
ret.append(out.to(ori_type))
114-
return torch.cat(ret, dim=0)
123+
output = torch.cat(ret, dim=0)
124+
125+
return output, h0 + h1 + h2 + h3

video_depth_anything/motion_module/motion_module.py

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,12 @@ def __init__(
5757
if zero_initialize:
5858
self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
5959

60-
def forward(self, input_tensor, encoder_hidden_states, attention_mask=None):
60+
def forward(self, input_tensor, encoder_hidden_states, attention_mask=None, cached_hidden_state_list=None):
6161
hidden_states = input_tensor
62-
hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
62+
hidden_states, output_hidden_state_list = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask, cached_hidden_state_list)
6363

6464
output = hidden_states
65-
return output
65+
return output, output_hidden_state_list # list of hidden states
6666

6767

6868
class TemporalTransformer3DModel(nn.Module):
@@ -99,8 +99,10 @@ def __init__(
9999
)
100100
self.proj_out = nn.Linear(inner_dim, in_channels)
101101

102-
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
102+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, cached_hidden_state_list=None):
103103
assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
104+
output_hidden_state_list = []
105+
104106
video_length = hidden_states.shape[2]
105107
hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
106108

@@ -113,8 +115,14 @@ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None
113115
hidden_states = self.proj_in(hidden_states)
114116

115117
# Transformer Blocks
116-
for block in self.transformer_blocks:
117-
hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length, attention_mask=attention_mask)
118+
if cached_hidden_state_list is not None:
119+
n = len(cached_hidden_state_list) // len(self.transformer_blocks)
120+
else:
121+
n = 0
122+
for i, block in enumerate(self.transformer_blocks):
123+
hidden_states, hidden_state_list = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length, attention_mask=attention_mask,
124+
cached_hidden_state_list=cached_hidden_state_list[i*n:(i+1)*n] if n else None)
125+
output_hidden_state_list.extend(hidden_state_list)
118126

119127
# output
120128
hidden_states = self.proj_out(hidden_states)
@@ -123,7 +131,7 @@ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None
123131
output = hidden_states + residual
124132
output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
125133

126-
return output
134+
return output, output_hidden_state_list
127135

128136

129137
class TemporalTransformerBlock(nn.Module):
@@ -161,20 +169,24 @@ def __init__(
161169
self.ff_norm = nn.LayerNorm(dim)
162170

163171

164-
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
165-
for attention_block, norm in zip(self.attention_blocks, self.norms):
172+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None, cached_hidden_state_list=None):
173+
output_hidden_state_list = []
174+
for i, (attention_block, norm) in enumerate(zip(self.attention_blocks, self.norms)):
166175
norm_hidden_states = norm(hidden_states)
167-
hidden_states = attention_block(
176+
residual_hidden_states, output_hidden_states = attention_block(
168177
norm_hidden_states,
169178
encoder_hidden_states=encoder_hidden_states,
170179
video_length=video_length,
171180
attention_mask=attention_mask,
172-
) + hidden_states
181+
cached_hidden_states=cached_hidden_state_list[i] if cached_hidden_state_list is not None else None,
182+
)
183+
hidden_states = residual_hidden_states + hidden_states
184+
output_hidden_state_list.append(output_hidden_states)
173185

174186
hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
175187

176188
output = hidden_states
177-
return output
189+
return output, output_hidden_state_list
178190

179191

180192
class PositionalEncoding(nn.Module):
@@ -227,9 +239,21 @@ def __init__(
227239
else:
228240
raise NotImplementedError
229241

230-
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
242+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None, cached_hidden_states=None):
243+
# TODO: support cache for these
244+
assert encoder_hidden_states is None
245+
assert attention_mask is None
246+
231247
d = hidden_states.shape[1]
232-
hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
248+
d_in = 0
249+
if cached_hidden_states is None:
250+
hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
251+
input_hidden_states = hidden_states # (bxd) f c
252+
else:
253+
hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=1)
254+
input_hidden_states = hidden_states
255+
d_in = cached_hidden_states.shape[1]
256+
hidden_states = torch.cat([cached_hidden_states, hidden_states], dim=1)
233257

234258
if self.pos_encoder is not None:
235259
hidden_states = self.pos_encoder(hidden_states)
@@ -239,7 +263,7 @@ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None
239263
if self.group_norm is not None:
240264
hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
241265

242-
query = self.to_q(hidden_states)
266+
query = self.to_q(hidden_states[:, d_in:, ...])
243267
dim = query.shape[-1]
244268

245269
if self.added_kv_proj_dim is not None:
@@ -294,4 +318,4 @@ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None
294318

295319
hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
296320

297-
return hidden_states
321+
return hidden_states, input_hidden_states

video_depth_anything/video_depth.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
# Copyright (2025) Bytedance Ltd. and/or its affiliates
1+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
22

3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
66

7-
# http://www.apache.org/licenses/LICENSE-2.0
7+
# http://www.apache.org/licenses/LICENSE-2.0
88

9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
1414
import torch
1515
import torch.nn.functional as F
1616
import torch.nn as nn
@@ -36,9 +36,9 @@ class VideoDepthAnything(nn.Module):
3636
def __init__(
3737
self,
3838
encoder='vitl',
39-
features=256,
40-
out_channels=[256, 512, 1024, 1024],
41-
use_bn=False,
39+
features=256,
40+
out_channels=[256, 512, 1024, 1024],
41+
use_bn=False,
4242
use_clstoken=False,
4343
num_frames=32,
4444
pe='ape'
@@ -49,7 +49,7 @@ def __init__(
4949
'vits': [2, 5, 8, 11],
5050
'vitl': [4, 11, 17, 23]
5151
}
52-
52+
5353
self.encoder = encoder
5454
self.pretrained = DINOv2(model_name=encoder)
5555

@@ -59,11 +59,11 @@ def forward(self, x):
5959
B, T, C, H, W = x.shape
6060
patch_h, patch_w = H // 14, W // 14
6161
features = self.pretrained.get_intermediate_layers(x.flatten(0,1), self.intermediate_layer_idx[self.encoder], return_class_token=True)
62-
depth = self.head(features, patch_h, patch_w, T)
62+
depth = self.head(features, patch_h, patch_w, T)[0]
6363
depth = F.interpolate(depth, size=(H, W), mode="bilinear", align_corners=True)
6464
depth = F.relu(depth)
6565
return depth.squeeze(1).unflatten(0, (B, T)) # return shape [B, T, H, W]
66-
66+
6767
def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda', fp32=False):
6868
frame_height, frame_width = frames[0].shape[:2]
6969
ratio = max(frame_height, frame_width) / min(frame_height, frame_width)
@@ -90,7 +90,7 @@ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda', f
9090
org_video_len = len(frame_list)
9191
append_frame_len = (frame_step - (org_video_len % frame_step)) % frame_step + (INFER_LEN - frame_step)
9292
frame_list = frame_list + [frame_list[-1].copy()] * append_frame_len
93-
93+
9494
depth_list = []
9595
pre_input = None
9696
for frame_id in tqdm(range(0, org_video_len, frame_step)):
@@ -149,8 +149,8 @@ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda', f
149149
new_depth = depth_list[frame_id+kf_id] * scale + shift
150150
new_depth[new_depth<0] = 0
151151
ref_align.append(new_depth)
152-
152+
153153
depth_list = depth_list_aligned
154-
154+
155155
return np.stack(depth_list[:org_video_len], axis=0), target_fps
156-
156+

0 commit comments

Comments
 (0)