Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions vace/annotators/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,61 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

class PlainImageAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, image):
return image

class PlainVideoAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, frames):
return frames

class PlainMaskAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, mask):
return mask

class PlainMaskAugInvertAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, mask):
return 255 - mask

class PlainMaskAugAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, mask):
return mask

class PlainMaskVideoAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, mask):
return mask

class PlainMaskAugVideoAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, masks):
return masks

class PlainMaskAugInvertVideoAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, masks):
return [255 - mask for mask in masks]

class ExpandMaskVideoAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, mask, expand_num):
return [mask] * expand_num

class PlainPromptAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, prompt):
return prompt
12 changes: 6 additions & 6 deletions vace/annotators/composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np

class CompositionAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
self.process_types = ["repaint", "extension", "control"]
self.process_map = {
"repaint": "repaint",
Expand Down Expand Up @@ -44,7 +44,7 @@ def forward(self, process_type_1, process_type_2, frames_1, frames_2, masks_1, m


class ReferenceAnythingAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
from .subject import SubjectAnnotator
self.sbjref_ins = SubjectAnnotator(cfg['SUBJECT'] if 'SUBJECT' in cfg else cfg)
self.key_map = {
Expand Down Expand Up @@ -74,7 +74,7 @@ def forward(self, images, mode=None, return_mask=None, mask_cfg=None):


class AnimateAnythingAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
from .pose import PoseBodyFaceVideoAnnotator
self.pose_ins = PoseBodyFaceVideoAnnotator(cfg['POSE'])
self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
Expand All @@ -91,7 +91,7 @@ def forward(self, frames=None, images=None, mode=None, return_mask=None, mask_cf


class SwapAnythingAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
from .inpainting import InpaintingVideoAnnotator
self.inp_ins = InpaintingVideoAnnotator(cfg['INPAINTING'])
self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
Expand All @@ -110,7 +110,7 @@ def forward(self, video=None, frames=None, images=None, mode=None, mask=None, bb


class ExpandAnythingAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
from .outpainting import OutpaintingAnnotator
from .frameref import FrameRefExpandAnnotator
self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
Expand All @@ -137,7 +137,7 @@ def forward(self, images=None, mode=None, return_mask=None, mask_cfg=None, direc


class MoveAnythingAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
from .layout import LayoutBboxAnnotator
self.layout_bbox_ins = LayoutBboxAnnotator(cfg['LAYOUTBBOX'])

Expand Down
4 changes: 2 additions & 2 deletions vace/annotators/depth.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class DepthAnnotator:
def __init__(self, cfg, device=None):
from .midas.api import MiDaSInference
pretrained_model = cfg['PRETRAINED_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.model = MiDaSInference(model_type='dpt_hybrid', model_path=pretrained_model).to(self.device)
self.a = cfg.get('A', np.pi * 2.0)
self.bg_th = cfg.get('BG_TH', 0.1)
Expand Down Expand Up @@ -53,7 +53,7 @@ class DepthV2Annotator:
def __init__(self, cfg, device=None):
from .depth_anything_v2.dpt import DepthAnythingV2
pretrained_model = cfg['PRETRAINED_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(self.device)
self.model.load_state_dict(
torch.load(
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/face.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, cfg, device=None):
self.return_dict = cfg.get('RETURN_DICT', False)
self.multi_face = cfg.get('MULTI_FACE', True)
pretrained_model = cfg['PRETRAINED_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.device_id = self.device.index if self.device.type == 'cuda' else None
ctx_id = self.device_id if self.device_id is not None else 0
self.model = FaceAnalysis(name=cfg.MODEL_NAME, root=pretrained_model, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, cfg, device=None):
}
params = argparse.Namespace(**params)
pretrained_model = cfg['PRETRAINED_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.model = RAFT(params)
self.model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(pretrained_model, map_location="cpu", weights_only=True).items()})
self.model = self.model.to(self.device).eval()
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/gdino.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, cfg, device=None):
self.text_threshold = cfg.get('TEXT_THRESHOLD', 0.2)
self.iou_threshold = cfg.get('IOU_THRESHOLD', 0.5)
self.use_nms = cfg.get('USE_NMS', True)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.model = Model(model_config_path=grounding_dino_config_path,
model_checkpoint_path=grounding_dino_checkpoint_path,
device=self.device)
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/gray.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


class GrayAnnotator:
def __init__(self, cfg):
def __init__(self, cfg, device=None):
pass
def forward(self, image):
image = convert_to_numpy(image)
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/pose.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class PoseAnnotator:
def __init__(self, cfg, device=None):
onnx_det = cfg['DETECTION_MODEL']
onnx_pose = cfg['POSE_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.pose_estimation = Wholebody(onnx_det, onnx_pose, device=self.device)
self.resize_size = cfg.get("RESIZE_SIZE", 1024)
self.use_body = cfg.get('USE_BODY', True)
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/prompt_extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def __init__(self, cfg, device=None):
self.model_name = cfg.get('MODEL_NAME', "Qwen2.5_3B")
self.is_vl = cfg.get('IS_VL', False)
self.system_prompt = cfg.get('SYSTEM_PROMPT', None)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.device_id = self.device.index if self.device.type == 'cuda' else None
rank = self.device_id if self.device_id is not None else 0
if self.mode == "dashscope":
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/ram.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self, cfg, device=None):
ram_checkpoint_path = cfg['PRETRAINED_MODEL']
ram_type = cfg.get('RAM_TYPE', 'swin_l')
self.return_lang = cfg.get('RETURN_LANG', ['en']) # ['en', 'zh']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.model = ram_plus(pretrained=ram_checkpoint_path, image_size=image_size, vit=ram_type,
text_encoder_type=ram_tokenizer_path, delete_tag_index=delete_tag_index).eval().to(self.device)
self.ram_transform = Compose([
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/salient.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def __init__(self, cfg, device=None):
self.return_image = cfg.get('RETURN_IMAGE', False)
self.use_crop = cfg.get('USE_CROP', False)
pretrained_model = cfg['PRETRAINED_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.norm_mean = [0.485, 0.456, 0.406]
self.norm_std = [0.229, 0.224, 0.225]
self.norm_size = cfg.get('NORM_SIZE', [320, 320])
Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self, cfg, device=None):
self.task_type = cfg.get('TASK_TYPE', 'input_box')
self.return_mask = cfg.get('RETURN_MASK', False)
self.transform = ResizeLongestSide(1024)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
seg_model = sam_model_registry[cfg.get('MODEL_NAME', 'vit_b')](checkpoint=cfg['PRETRAINED_MODEL']).eval().to(self.device)
self.predictor = SamPredictor(seg_model)

Expand Down
2 changes: 1 addition & 1 deletion vace/annotators/scribble.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def __init__(self, cfg, device=None):
n_residual_blocks = cfg.get('N_RESIDUAL_BLOCKS', 3)
sigmoid = cfg.get('SIGMOID', True)
pretrained_model = cfg['PRETRAINED_MODEL']
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device)
self.model = ContourInference(input_nc, output_nc, n_residual_blocks,
sigmoid)
self.model.load_state_dict(torch.load(pretrained_model, weights_only=True))
Expand Down
10 changes: 5 additions & 5 deletions vace/gradios/vace_preprocess_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def dict_to_markdown_table(d):


class VACEImageTag():
def __init__(self, cfg):
def __init__(self, cfg, device=None):
self.save_dir = os.path.join(cfg.save_dir, 'image')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
Expand Down Expand Up @@ -320,7 +320,7 @@ def set_callbacks_image(self, **kwargs):


class VACEVideoTag():
def __init__(self, cfg):
def __init__(self, cfg, device=None):
self.save_dir = os.path.join(cfg.save_dir, 'video')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
Expand Down Expand Up @@ -651,7 +651,7 @@ def set_callbacks_video(self, **kwargs):


class VACETagComposition():
def __init__(self, cfg):
def __init__(self, cfg, device=None):
self.save_dir = os.path.join(cfg.save_dir, 'composition')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
Expand Down Expand Up @@ -805,7 +805,7 @@ def set_callbacks_composition(self, **kwargs):


class VACEVideoTool():
def __init__(self, cfg):
def __init__(self, cfg, device=None):
self.save_dir = os.path.join(cfg.save_dir, 'video_tool')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
Expand Down Expand Up @@ -1031,7 +1031,7 @@ def set_callbacks_video_tool(self, **kwargs):

class VACETag():

def __init__(self, cfg):
def __init__(self, cfg, device=None):
self.cfg = cfg
self.save_dir = cfg.save_dir
self.current_index = 0
Expand Down
2 changes: 1 addition & 1 deletion vace/models/wan/wan_vace.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ def __init__(
self.ring_size = ring_size
self.dynamic_load()

self.device = 'cpu' if torch.cuda.is_available() else 'cpu'
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.vid_proc = VaceVideoProcessor(
downsample=tuple([x * y for x, y in zip(config.vae_stride, config.patch_size)]),
min_area=720 * 1280,
Expand Down