diff --git a/vace/annotators/common.py b/vace/annotators/common.py index f52cec9..f956a9a 100644 --- a/vace/annotators/common.py +++ b/vace/annotators/common.py @@ -2,61 +2,61 @@ # Copyright (c) Alibaba, Inc. and its affiliates. class PlainImageAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, image): return image class PlainVideoAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, frames): return frames class PlainMaskAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, mask): return mask class PlainMaskAugInvertAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, mask): return 255 - mask class PlainMaskAugAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, mask): return mask class PlainMaskVideoAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, mask): return mask class PlainMaskAugVideoAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, masks): return masks class PlainMaskAugInvertVideoAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, masks): return [255 - mask for mask in masks] class ExpandMaskVideoAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, mask, expand_num): return [mask] * expand_num class PlainPromptAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, prompt): return prompt \ No newline at end of file diff --git a/vace/annotators/composition.py b/vace/annotators/composition.py index fa63a92..6f42970 100644 --- a/vace/annotators/composition.py +++ b/vace/annotators/composition.py @@ -3,7 +3,7 @@ import numpy as np class CompositionAnnotator: - def __init__(self, cfg): 
+ def __init__(self, cfg, device=None): self.process_types = ["repaint", "extension", "control"] self.process_map = { "repaint": "repaint", @@ -44,7 +44,7 @@ def forward(self, process_type_1, process_type_2, frames_1, frames_2, masks_1, m class ReferenceAnythingAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): from .subject import SubjectAnnotator self.sbjref_ins = SubjectAnnotator(cfg['SUBJECT'] if 'SUBJECT' in cfg else cfg) self.key_map = { @@ -74,7 +74,7 @@ def forward(self, images, mode=None, return_mask=None, mask_cfg=None): class AnimateAnythingAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): from .pose import PoseBodyFaceVideoAnnotator self.pose_ins = PoseBodyFaceVideoAnnotator(cfg['POSE']) self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE']) @@ -91,7 +91,7 @@ def forward(self, frames=None, images=None, mode=None, return_mask=None, mask_cf class SwapAnythingAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): from .inpainting import InpaintingVideoAnnotator self.inp_ins = InpaintingVideoAnnotator(cfg['INPAINTING']) self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE']) @@ -110,7 +110,7 @@ def forward(self, video=None, frames=None, images=None, mode=None, mask=None, bb class ExpandAnythingAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): from .outpainting import OutpaintingAnnotator from .frameref import FrameRefExpandAnnotator self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE']) @@ -137,7 +137,7 @@ def forward(self, images=None, mode=None, return_mask=None, mask_cfg=None, direc class MoveAnythingAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): from .layout import LayoutBboxAnnotator self.layout_bbox_ins = LayoutBboxAnnotator(cfg['LAYOUTBBOX']) diff --git a/vace/annotators/depth.py b/vace/annotators/depth.py index 3e709bf..de8280c 100644 --- a/vace/annotators/depth.py +++ b/vace/annotators/depth.py @@ 
-10,7 +10,7 @@ class DepthAnnotator: def __init__(self, cfg, device=None): from .midas.api import MiDaSInference pretrained_model = cfg['PRETRAINED_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.model = MiDaSInference(model_type='dpt_hybrid', model_path=pretrained_model).to(self.device) self.a = cfg.get('A', np.pi * 2.0) self.bg_th = cfg.get('BG_TH', 0.1) @@ -53,7 +53,7 @@ class DepthV2Annotator: def __init__(self, cfg, device=None): from .depth_anything_v2.dpt import DepthAnythingV2 pretrained_model = cfg['PRETRAINED_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(self.device) self.model.load_state_dict( torch.load( diff --git a/vace/annotators/face.py b/vace/annotators/face.py index 523e1bc..a0564a7 100644 --- a/vace/annotators/face.py +++ b/vace/annotators/face.py @@ -15,7 +15,7 @@ def __init__(self, cfg, device=None): self.return_dict = cfg.get('RETURN_DICT', False) self.multi_face = cfg.get('MULTI_FACE', True) pretrained_model = cfg['PRETRAINED_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.device_id = self.device.index if self.device.type == 'cuda' else None ctx_id = self.device_id if self.device_id is not None else 0 self.model = FaceAnalysis(name=cfg.MODEL_NAME, 
root=pretrained_model, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) diff --git a/vace/annotators/flow.py b/vace/annotators/flow.py index c5494f5..29ca132 100644 --- a/vace/annotators/flow.py +++ b/vace/annotators/flow.py @@ -24,7 +24,7 @@ def __init__(self, cfg, device=None): } params = argparse.Namespace(**params) pretrained_model = cfg['PRETRAINED_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.model = RAFT(params) self.model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(pretrained_model, map_location="cpu", weights_only=True).items()}) self.model = self.model.to(self.device).eval() diff --git a/vace/annotators/gdino.py b/vace/annotators/gdino.py index 578bae5..b83bdb0 100644 --- a/vace/annotators/gdino.py +++ b/vace/annotators/gdino.py @@ -23,7 +23,7 @@ def __init__(self, cfg, device=None): self.text_threshold = cfg.get('TEXT_THRESHOLD', 0.2) self.iou_threshold = cfg.get('IOU_THRESHOLD', 0.5) self.use_nms = cfg.get('USE_NMS', True) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.model = Model(model_config_path=grounding_dino_config_path, model_checkpoint_path=grounding_dino_checkpoint_path, device=self.device) diff --git a/vace/annotators/gray.py b/vace/annotators/gray.py index 61a2313..de96c68 100644 --- a/vace/annotators/gray.py +++ b/vace/annotators/gray.py @@ -7,7 +7,7 @@ class GrayAnnotator: - def __init__(self, cfg): + def __init__(self, cfg, device=None): pass def forward(self, image): image = convert_to_numpy(image) diff --git a/vace/annotators/pose.py 
b/vace/annotators/pose.py index dcb404e..944776d 100644 --- a/vace/annotators/pose.py +++ b/vace/annotators/pose.py @@ -36,7 +36,7 @@ class PoseAnnotator: def __init__(self, cfg, device=None): onnx_det = cfg['DETECTION_MODEL'] onnx_pose = cfg['POSE_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.pose_estimation = Wholebody(onnx_det, onnx_pose, device=self.device) self.resize_size = cfg.get("RESIZE_SIZE", 1024) self.use_body = cfg.get('USE_BODY', True) diff --git a/vace/annotators/prompt_extend.py b/vace/annotators/prompt_extend.py index 8e47a7b..2cdc43c 100644 --- a/vace/annotators/prompt_extend.py +++ b/vace/annotators/prompt_extend.py @@ -10,7 +10,7 @@ def __init__(self, cfg, device=None): self.model_name = cfg.get('MODEL_NAME', "Qwen2.5_3B") self.is_vl = cfg.get('IS_VL', False) self.system_prompt = cfg.get('SYSTEM_PROMPT', None) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.device_id = self.device.index if self.device.type == 'cuda' else None rank = self.device_id if self.device_id is not None else 0 if self.mode == "dashscope": diff --git a/vace/annotators/ram.py b/vace/annotators/ram.py index 1bc4712..3b2efbe 100644 --- a/vace/annotators/ram.py +++ b/vace/annotators/ram.py @@ -22,7 +22,7 @@ def __init__(self, cfg, device=None): ram_checkpoint_path = cfg['PRETRAINED_MODEL'] ram_type = cfg.get('RAM_TYPE', 'swin_l') self.return_lang = cfg.get('RETURN_LANG', ['en']) # ['en', 'zh'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.model = ram_plus(pretrained=ram_checkpoint_path, image_size=image_size, vit=ram_type, text_encoder_type=ram_tokenizer_path, delete_tag_index=delete_tag_index).eval().to(self.device) self.ram_transform = Compose([ diff --git a/vace/annotators/salient.py b/vace/annotators/salient.py index 9584f1d..3124fdd 100644 --- a/vace/annotators/salient.py +++ b/vace/annotators/salient.py @@ -313,7 +313,7 @@ def __init__(self, cfg, device=None): self.return_image = cfg.get('RETURN_IMAGE', False) self.use_crop = cfg.get('USE_CROP', False) pretrained_model = cfg['PRETRAINED_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.norm_mean = [0.485, 0.456, 0.406] self.norm_std = [0.229, 0.224, 0.225] self.norm_size = cfg.get('NORM_SIZE', [320, 320]) diff --git a/vace/annotators/sam.py b/vace/annotators/sam.py index 1246c61..08f8b33 100644 --- a/vace/annotators/sam.py +++ b/vace/annotators/sam.py @@ -18,7 +18,7 @@ def __init__(self, cfg, device=None): self.task_type = cfg.get('TASK_TYPE', 'input_box') self.return_mask = cfg.get('RETURN_MASK', False) self.transform = ResizeLongestSide(1024) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) seg_model = sam_model_registry[cfg.get('MODEL_NAME', 'vit_b')](checkpoint=cfg['PRETRAINED_MODEL']).eval().to(self.device) self.predictor = SamPredictor(seg_model) diff --git a/vace/annotators/scribble.py b/vace/annotators/scribble.py index 41c5e79..b811982 100644 
--- a/vace/annotators/scribble.py +++ b/vace/annotators/scribble.py @@ -105,7 +105,7 @@ def __init__(self, cfg, device=None): n_residual_blocks = cfg.get('N_RESIDUAL_BLOCKS', 3) sigmoid = cfg.get('SIGMOID', True) pretrained_model = cfg['PRETRAINED_MODEL'] - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else (torch.device(device) if isinstance(device, str) else device) self.model = ContourInference(input_nc, output_nc, n_residual_blocks, sigmoid) self.model.load_state_dict(torch.load(pretrained_model, weights_only=True)) diff --git a/vace/gradios/vace_preprocess_demo.py b/vace/gradios/vace_preprocess_demo.py index 1ee929f..7c33c7c 100644 --- a/vace/gradios/vace_preprocess_demo.py +++ b/vace/gradios/vace_preprocess_demo.py @@ -34,7 +34,7 @@ def dict_to_markdown_table(d): class VACEImageTag(): - def __init__(self, cfg): + def __init__(self, cfg, device=None): self.save_dir = os.path.join(cfg.save_dir, 'image') if not os.path.exists(self.save_dir): os.makedirs(self.save_dir) @@ -320,7 +320,7 @@ def set_callbacks_image(self, **kwargs): class VACEVideoTag(): - def __init__(self, cfg): + def __init__(self, cfg, device=None): self.save_dir = os.path.join(cfg.save_dir, 'video') if not os.path.exists(self.save_dir): os.makedirs(self.save_dir) @@ -651,7 +651,7 @@ def set_callbacks_video(self, **kwargs): class VACETagComposition(): - def __init__(self, cfg): + def __init__(self, cfg, device=None): self.save_dir = os.path.join(cfg.save_dir, 'composition') if not os.path.exists(self.save_dir): os.makedirs(self.save_dir) @@ -805,7 +805,7 @@ def set_callbacks_composition(self, **kwargs): class VACEVideoTool(): - def __init__(self, cfg): + def __init__(self, cfg, device=None): self.save_dir = os.path.join(cfg.save_dir, 'video_tool') if not os.path.exists(self.save_dir): os.makedirs(self.save_dir) @@ -1031,7 +1031,7 @@ def 
set_callbacks_video_tool(self, **kwargs): class VACETag(): - def __init__(self, cfg): + def __init__(self, cfg, device=None): self.cfg = cfg self.save_dir = cfg.save_dir self.current_index = 0 diff --git a/vace/models/wan/wan_vace.py b/vace/models/wan/wan_vace.py index d388c50..497b544 100644 --- a/vace/models/wan/wan_vace.py +++ b/vace/models/wan/wan_vace.py @@ -456,7 +456,7 @@ def __init__( self.ring_size = ring_size self.dynamic_load() - self.device = 'cpu' if torch.cuda.is_available() else 'cpu' + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.vid_proc = VaceVideoProcessor( downsample=tuple([x * y for x, y in zip(config.vae_stride, config.patch_size)]), min_area=720 * 1280,