update video writer

Shengnan-Zhu · Shengnan-Zhu · commit 3999b5e41ca2 · 2025-01-23T21:48:17.000+08:00
diff --git a/app.py b/app.py
@@ -13,16 +13,15 @@
 # limitations under the License. 
 import gradio as gr
 
-
 import numpy as np
 import os
 import torch
 
 from video_depth_anything.video_depth import VideoDepthAnything
-from utils.dc_utils import read_video_frames, vis_sequence_depth, save_video
+from utils.dc_utils import read_video_frames, save_video
 
 examples = [
-    ['assets/example_videos/davis_rollercoaster.mp4'],
+    ['assets/example_videos/davis_rollercoaster.mp4', -1, -1, 1280],
 ]
 
 model_configs = {
@@ -46,17 +45,16 @@ def infer_video_depth(
     input_size: int = 518,
 ):
     frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
-    depth_list, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device='cuda')
-    depth_list = np.stack(depth_list, axis=0)
-    vis = vis_sequence_depth(depth_list)
+    depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device='cuda')
+
     video_name = os.path.basename(input_video)
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
     processed_video_path = os.path.join(output_dir, os.path.splitext(video_name)[0]+'_src.mp4')
     depth_vis_path = os.path.join(output_dir, os.path.splitext(video_name)[0]+'_vis.mp4')
     save_video(frames, processed_video_path, fps=fps)
-    save_video(vis, depth_vis_path, fps=fps)
+    save_video(depths, depth_vis_path, fps=fps, is_depths=True)
 
     return [processed_video_path, depth_vis_path]
 
diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,8 @@ torchvision
 opencv-python
 matplotlib
 pillow
-mediapy
+imageio
+imageio-ffmpeg
 decord
 xformers
 einops
diff --git a/run.py b/run.py
@@ -17,7 +17,7 @@
 import torch
 
 from video_depth_anything.video_depth import VideoDepthAnything
-from utils.dc_utils import read_video_frames, vis_sequence_depth, save_video
+from utils.dc_utils import read_video_frames, save_video
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Video Depth Anything')
@@ -43,17 +43,16 @@
     video_depth_anything = video_depth_anything.to(DEVICE).eval()
 
     frames, target_fps = read_video_frames(args.input_video, args.max_len, args.target_fps, args.max_res)
-    depth_list, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=args.input_size, device=DEVICE)
-    depth_list = np.stack(depth_list, axis=0)
-    vis = vis_sequence_depth(depth_list)
+    depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=args.input_size, device=DEVICE)
+    
     video_name = os.path.basename(args.input_video)
     if not os.path.exists(args.output_dir):
         os.makedirs(args.output_dir)
 
     processed_video_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_src.mp4')
     depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_vis.mp4')
     save_video(frames, processed_video_path, fps=fps)
-    save_video(vis, depth_vis_path, fps=fps)
+    save_video(depths, depth_vis_path, fps=fps, is_depths=True)
 
     
 
diff --git a/utils/dc_utils.py b/utils/dc_utils.py
@@ -3,31 +3,29 @@
 #
 # This file may have been modified by ByteDance Ltd. and/or its affiliates on [date of modification]
 # Original file is released under [ MIT License license], with the full license text available at [https://github.com/Tencent/DepthCrafter?tab=License-1-ov-file].
-from typing import Union, List
-import tempfile
 import numpy as np
-import PIL.Image
 import matplotlib.cm as cm
-import mediapy
-import torch
+import imageio
 try:
     from decord import VideoReader, cpu
     DECORD_AVAILABLE = True
 except:
     import cv2
     DECORD_AVAILABLE = False
 
+def ensure_even(value):
+    return value if value % 2 == 0 else value + 1
 
-def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1, dataset="open"):
+def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1):
     if DECORD_AVAILABLE:
         vid = VideoReader(video_path, ctx=cpu(0))
         original_height, original_width = vid.get_batch([0]).shape[1:3]
         height = original_height
         width = original_width
         if max_res > 0 and max(height, width) > max_res:
             scale = max_res / max(original_height, original_width)
-            height = round(original_height * scale)
-            width = round(original_width * scale)
+            height = ensure_even(round(original_height * scale))
+            width = ensure_even(round(original_width * scale))
 
         vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
 
@@ -71,46 +69,18 @@ def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1, dat
     return frames, fps
 
 
-def save_video(
-    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]],
-    output_video_path: str = None,
-    fps: int = 10,
-    crf: int = 18,
-) -> str:
-    if output_video_path is None:
-        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
-
-    if isinstance(video_frames[0], np.ndarray):
-        video_frames = [frame.astype(np.uint8) for frame in video_frames]
-
-    elif isinstance(video_frames[0], PIL.Image.Image):
-        video_frames = [np.array(frame) for frame in video_frames]
-    mediapy.write_video(output_video_path, video_frames, fps=fps, crf=crf)
-    return output_video_path
-
-
-class ColorMapper:
-    # a color mapper to map depth values to a certain colormap
-    def __init__(self, colormap: str = "inferno"):
-        self.colormap = torch.tensor(cm.get_cmap(colormap).colors)
-
-    def apply(self, image: torch.Tensor, v_min=None, v_max=None):
-        # assert len(image.shape) == 2
-        if v_min is None:
-            v_min = image.min()
-        if v_max is None:
-            v_max = image.max()
-        image = (image - v_min) / (v_max - v_min)
-        image = (image * 255).long()
-        image = self.colormap[image] * 255
-        return image
-
+def save_video(frames, output_video_path, fps=10, is_depths=False):
+    writer = imageio.get_writer(output_video_path, fps=fps, macro_block_size=1, codec='libx264', ffmpeg_params=['-crf', '18'])
+    if is_depths:
+        colormap = np.array(cm.get_cmap("inferno").colors)
+        d_min, d_max = frames.min(), frames.max()
+        for i in range(frames.shape[0]):
+            depth = frames[i]
+            depth_norm = ((depth - d_min) / (d_max - d_min) * 255).astype(np.uint8)
+            depth_vis = (colormap[depth_norm] * 255).astype(np.uint8)
+            writer.append_data(depth_vis)
+    else:
+        for i in range(frames.shape[0]):
+            writer.append_data(frames[i])
 
-def vis_sequence_depth(depths: np.ndarray, v_min=None, v_max=None):
-    visualizer = ColorMapper()
-    if v_min is None:
-        v_min = depths.min()
-    if v_max is None:
-        v_max = depths.max()
-    res = visualizer.apply(torch.tensor(depths), v_min=v_min, v_max=v_max).numpy()
-    return res
+    writer.close()
diff --git a/video_depth_anything/video_depth.py b/video_depth_anything/video_depth.py
@@ -150,5 +150,5 @@ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda'):
             
         depth_list = depth_list_aligned
             
-        return depth_list[:org_video_len], target_fps
+        return np.stack(depth_list[:org_video_len], axis=0), target_fps
         

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -150,5 +150,5 @@ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda'):` |
| `150` | `150` | |
| `151` | `151` | `depth_list = depth_list_aligned` |
| `152` | `152` | |
| `153` | | `- return depth_list[:org_video_len], target_fps` |
| | `153` | `+ return np.stack(depth_list[:org_video_len], axis=0), target_fps` |
| `154` | `154` | |