diff --git a/arkit_dataset.py b/arkit_dataset.py
index 389bac7..a7c6589 100644
--- a/arkit_dataset.py
+++ b/arkit_dataset.py
@@ -196,6 +196,7 @@ def __getitem__(self, idx):
         if self.use_gdino:
             pred_box, pred_label = self.get_pred_det(frame_id)
             data['pred_bbox'] = pred_box
+            data['pred_label'] = pred_label

         # images: 2 x H x W x 3
         # detections: 2 x N x 4, N can be not the same
@@ -339,7 +340,7 @@ def arkit_collate_fn(batch):
         ret['pred_bbox'] = batch_pred_bbox
         ret['pred_bbox_mask'] = batch_pred_bbox_mask
-
+        ret['pred_label'] = pad_sequence([item['pred_label'] for item in batch], batch_first=True, padding_value=-1)

     return ret
diff --git a/configs/experiments/aomsg.yaml b/configs/experiments/aomsg.yaml
index a5afb81..7795059 100644
--- a/configs/experiments/aomsg.yaml
+++ b/configs/experiments/aomsg.yaml
@@ -1,6 +1,6 @@
 # sample style training script
 dataset_path: './data/msg'
-device: 1
+device: 0

 pp_threshold: 0.3
 object_threshold: 0.2
@@ -22,8 +22,8 @@ wandb: True
 resume: False
 resume_path: './path/to/checkpoints/0+.pth'
 # for eval
-eval_output_dir: './exp-results/aomsg/2024-05-14_22-26-52' # this is specific to trained checkpoints
-eval_chkpt: 29-step22470+.pth #null for no checkpoint
+eval_output_dir: './exp-results/aomsg' # this is specific to trained checkpoints
+eval_chkpt: aomsg-s-4.pth #null for no checkpoint

 save_every: True # if save specific results for every video
 eval_step: 100
diff --git a/configs/experiments/aomsg_gdino.yaml b/configs/experiments/aomsg_gdino.yaml
index 78b023b..1330fc4 100644
--- a/configs/experiments/aomsg_gdino.yaml
+++ b/configs/experiments/aomsg_gdino.yaml
@@ -14,7 +14,7 @@ loss_params:

 eval_split: Test
 train_split: Training
-output_dir: './exp-results/gdino'
+output_dir: './exp-results/aomsg-gdino'
 output_file: 'train'
 wandb: False

@@ -23,7 +23,7 @@ resume: False
 resume_path: './path/to/checkpoints/0+.pth'

 eval_output_dir: './exp-results/emb_ckpt' # this is specific to trained checkpoints
-eval_chkpt: aomsg_s_4.pth #29-step44940+.pth #null for no checkpoint
+eval_chkpt: aomsg-s-4.pth #29-step44940+.pth #null for no checkpoint
 save_every: True # if save specific results for every video
 eval_step: 100
diff --git a/configs/experiments/direct_dinobase.yaml b/configs/experiments/direct_dinobase.yaml
new file mode 100644
index 0000000..2c870e0
--- /dev/null
+++ b/configs/experiments/direct_dinobase.yaml
@@ -0,0 +1,41 @@
+dataset_path: './data/msg'
+device: 0
+pp_threshold: 0.5
+object_threshold: 0.3
+
+eval_split: Test
+# train_split: Training
+
+output_dir: './exp-results/direct-dinov3-base'
+output_file: 'direct-dinov3-base'
+# wandb: True
+
+resume: False
+
+eval_output_dir: './exp-results/direct-dinov3-base'
+eval_chkpt: null
+
+save_every: True # if save specific results for every video
+
+num_workers: 4
+# train_bs: 32
+eval_bs: 64 # debug test
+
+
+obj_embedder:
+  model: "dinov3-base" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
+  weights: DEFAULT
+  freeze: True
+  output_type: mean
+
+place_embedder:
+  model: "dinov3-base" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
+  weights: DEFAULT
+  freeze: True
+  output_type: cls # mean, cls, feature, vec
+
+associator:
+  model: "SepMSG-direct" # "AoMSG-S-2"
+  object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
+  place_dim: 384
+  output_dim: null
\ No newline at end of file
diff --git a/configs/experiments/direct_dinolarge.yaml b/configs/experiments/direct_dinolarge.yaml
new file mode 100644
index 0000000..b7c58be
--- /dev/null
+++ b/configs/experiments/direct_dinolarge.yaml
@@ -0,0 +1,41 @@
+dataset_path: './data/msg'
+device: 0
+pp_threshold: 0.5
+object_threshold: 0.3
+
+eval_split: Test
+# train_split: Training
+
+output_dir: './exp-results/direct-dinov3-large'
+output_file: 'direct-dinov3-large'
+# wandb: True
+
+resume: False
+
+eval_output_dir: './exp-results/direct-dinov3-large'
+eval_chkpt: null
+
+save_every: True # if save specific results for every video
+
+num_workers: 4
+# train_bs: 32
+eval_bs: 64 # debug test
+
+
+obj_embedder:
+  model: "dinov3-large" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
+  weights: DEFAULT
+  freeze: True
+  output_type: mean
+
+place_embedder:
+  model: "dinov3-large" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
+  weights: DEFAULT
+  freeze: True
+  output_type: cls # mean, cls, feature, vec
+
+associator:
+  model: "SepMSG-direct" # "AoMSG-S-2"
+  object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
+  place_dim: 384
+  output_dim: null
\ No newline at end of file
diff --git a/configs/experiments/direct_dinosmall.yaml b/configs/experiments/direct_dinosmall.yaml
new file mode 100644
index 0000000..f76026a
--- /dev/null
+++ b/configs/experiments/direct_dinosmall.yaml
@@ -0,0 +1,41 @@
+dataset_path: './data/msg'
+device: 0
+pp_threshold: 0.5
+object_threshold: 0.3
+
+eval_split: Test
+# train_split: Training
+
+output_dir: './exp-results/direct-dinov3-small'
+output_file: 'direct-dinov3-small'
+# wandb: True
+
+resume: False
+
+eval_output_dir: './exp-results/direct-dinov3-small'
+eval_chkpt: null
+
+save_every: True # if save specific results for every video
+
+num_workers: 4
+# train_bs: 32
+eval_bs: 64 # debug test
+
+
+obj_embedder:
+  model: "dinov3-small" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
+  weights: DEFAULT
+  freeze: True
+  output_type: mean
+
+place_embedder:
+  model: "dinov3-small" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
+  weights: DEFAULT
+  freeze: True
+  output_type: cls # mean, cls, feature, vec
+
+associator:
+  model: "SepMSG-direct" # "AoMSG-S-2"
+  object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
+  place_dim: 384
+  output_dim: null
\ No newline at end of file
diff --git a/configs/experiments/direct_gdino.yaml b/configs/experiments/direct_gdino.yaml
new file mode 100644
index 0000000..a78ef84
--- /dev/null
+++ b/configs/experiments/direct_gdino.yaml
@@ -0,0 +1,49 @@
+dataset_path: './data/msg'
+device: 0
+pp_threshold: 0.5
+object_threshold: 0.3
+
+eval_split: Test
+# train_split: Training
+
+output_dir: './exp-results/direct-gdino'
+output_file: 'direct-gdino'
+# wandb: True
+
+resume: False
+
+eval_output_dir: './exp-results/direct-gdino'
+eval_chkpt: null
+
+save_every: True # if save specific results for every video
+
+num_workers: 4
+# train_bs: 32
+eval_bs: 64 # debug test
+
+#direct-grounding-dino
+detector:
+  model: grounding-dino # "grounding-dino", "fasterrcnn", "gt", "fasterrcnnv2"
+  num_classes: 18
+  freeze: True
+  weights: DEFAULT
+  pre_saved: True
+  result_path: "./exp-results/gdino-direct"
+
+obj_embedder:
+  model: "dinov2-small" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
+  weights: DEFAULT
+  freeze: True
+  output_type: mean
+
+place_embedder:
+  model: "dinov2-small" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
+  weights: DEFAULT
+  freeze: True
+  output_type: cls # mean, cls, feature, vec
+
+associator:
+  model: "SepMSG-direct" # "AoMSG-S-2"
+  object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
+  place_dim: 384
+  output_dim: null
\ No newline at end of file
diff --git a/eval.py b/eval.py
index ba7286a..8a350d3 100644
--- a/eval.py
+++ b/eval.py
@@ -69,7 +69,6 @@ def eval_per_video(dataset, dataloader, config, mapper, model, device, backproc,

     with torch.no_grad():
         for batch in tqdm(dataloader):
-            # print(batch)
             images = batch['image'].to(device)
             # potentially pass more information to the model
             additional_info = {
@@ -85,6 +84,7 @@ def eval_per_video(dataset, dataloader, config, mapper, model, device, backproc,
             if 'pred_bbox' in batch:
                 additional_info['pred_bbox'] = batch['pred_bbox'].to(device)
                 additional_info['pred_bbox_mask'] = batch['pred_bbox_mask'].to(device)
+                additional_info['pred_label'] = batch['pred_label'].to(device)

             results = model(images, additional_info)
diff --git a/models/encoders.py b/models/encoders.py
index 98c43c2..43b07f1 100644
--- a/models/encoders.py
+++ b/models/encoders.py
@@ -6,7 +6,7 @@ from torchvision.models import resnet50, resnet18, convnext_tiny, convnext_small, convnext_base

 # from transformers import AutoImageProcessor, ConvNextModel
-from transformers import Dinov2Model, ViTModel
+from transformers import Dinov2Model, ViTModel, AutoModel, AutoImageProcessor


 class ResNetEmbedder(nn.Module):
@@ -218,6 +218,93 @@ def forward(self, pixel_values):
         # print("dino emb", embs.size())
         # embs = last_hidden_states[:, 1:, :].mean(dim=1) # B x h, NOTE: its an average over all image tokens
         embs = self.adaptor(embs)
+        return embs
+
+
+class DINOv3Embedder(nn.Module):
+    # output type: cls or mean of the tokens, or perhaps tokens?
+    def __init__(self, model_type, output_type='mean', freeze=True, weights="DEFAULT"):
+        super().__init__()
+        repo_path = '/home/zw4269/dev/dinov3'
+
+        # DINOv3 ViT models pretrained on web images
+        dinov3_vits16 = torch.hub.load(repo_path, 'dinov3_vits16', source='local', weights='./exp-results/emb_ckpt/dino3/dinov3_vits16_pretrain_lvd1689m-08c60483.pth')
+        dinov3_vitb16 = torch.hub.load(repo_path, 'dinov3_vitb16', source='local', weights='./exp-results/emb_ckpt/dino3/dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth')
+        dinov3_vitl16 = torch.hub.load(repo_path, 'dinov3_vitl16', source='local', weights='./exp-results/emb_ckpt/dino3/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth')
+
+        if model_type == "dinov3_vits16":
+            self.dino_model = dinov3_vits16
+        elif model_type == "dinov3_vitb16":
+            self.dino_model = dinov3_vitb16
+        elif model_type == "dinov3_vitl16":
+            self.dino_model = dinov3_vitl16
+        else:
+            raise NotImplementedError(model_type)
+
+        if freeze:
+            for param in self.dino_model.parameters():
+                param.requires_grad = False
+            self.dino_model.eval()
+
+        # Get feature dimension from the model
+        self.feature_dim = self.dino_model.embed_dim
+        self.adaptor = nn.Identity()
+
+        if output_type == 'mean':
+            print("use mean")
+            self.projection = self.mean_projection
+        elif output_type == 'cls':
+            self.projection = self.cls_projection
+        elif output_type == 'feature':
+            self.projection = self.seq_projection
+        elif output_type == 'max':
+            print("use max")
+            self.projection = self.max_projection
+        elif output_type.startswith('gem'): # for example: gem_3
+            print("use", output_type)
+            p = float(int(output_type.split('_')[1]))
+            self.projection = self.gem_projection
+            self.p = nn.Parameter(torch.ones(1) * p)
+            if freeze:
+                self.p.requires_grad = False
+        else:
+            raise NotImplementedError
+
+    def mean_projection(self, x):
+        if isinstance(x, dict):
+            x = x['x_norm_patchtokens']
+        return x.mean(dim=1)
+
+    def max_projection(self, x):
+        if isinstance(x, dict):
+            x = x['x_norm_patchtokens']
+        x_bhl = x.permute(0, 2, 1)
+        xp_bh = F.adaptive_max_pool1d(x_bhl, output_size=1).squeeze(2)
+        return xp_bh
+
+    def gem_projection(self, x, eps=1e-6):
+        if isinstance(x, dict):
+            x = x['x_norm_patchtokens']
+        x_clamped = F.relu(x).clamp(min=eps)
+        gem_pooled = (x_clamped.pow(self.p).mean(dim=1, keepdim=False)).pow(1./self.p)
+        return gem_pooled
+
+    def cls_projection(self, x):
+        if isinstance(x, dict):
+            x = x['x_norm_clstoken']
+        return x
+
+    def seq_projection(self, x):
+        if isinstance(x, dict):
+            x = x['x_norm_patchtokens']
+        return x # B x L
+
+    def forward(self, pixel_values):
+        # DINOv3 forward pass
+        outputs = self.dino_model.forward_features(pixel_values)
+        last_hidden_states = outputs # B x L x h
+        embs = self.projection(last_hidden_states)
+        embs = self.adaptor(embs)
         return embs


@@ -258,6 +345,14 @@ def ViTLarge_Embedder(**kwargs):

 def ViTHuge_Embedder(**kwargs):
     return ViTEmbedder(model_type="vit_huge", **kwargs)

+def DINOv3Small_Embedder(**kwargs):
+    return DINOv3Embedder(model_type="dinov3_vits16", **kwargs)
+
+def DINOv3Base_Embedder(**kwargs):
+    return DINOv3Embedder(model_type="dinov3_vitb16", **kwargs)
+
+def DINOv3Large_Embedder(**kwargs):
+    return DINOv3Embedder(model_type="dinov3_vitl16", **kwargs)

 # def DINOv2Huge_Embedder(**kwargs):
@@ -275,5 +370,8 @@ def ViTHuge_Embedder(**kwargs):
     'dinov2-large': DINOv2Large_Embedder,
     'vit-base': ViTBase_Embedder,
     'vit-large': ViTLarge_Embedder,
-    'vit-huge': ViTHuge_Embedder
+    'vit-huge': ViTHuge_Embedder,
+    'dinov3-small': DINOv3Small_Embedder,
+    'dinov3-base': DINOv3Base_Embedder,
+    'dinov3-large': DINOv3Large_Embedder,
 }
\ No newline at end of file
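
Note on the pred_label plumbing above: arkit_collate_fn pads each sample's per-detection label tensor with -1 so that frames with different numbers of detections can be stacked into one batch, and eval.py then moves the padded tensor to the device alongside pred_bbox and pred_bbox_mask. A minimal, runnable sketch of that padding behaviour (the label values below are made up for illustration):

    import torch
    from torch.nn.utils.rnn import pad_sequence

    # Two frames with different numbers of predicted detections.
    labels_a = torch.tensor([3, 7])          # 2 detections
    labels_b = torch.tensor([1, 4, 4, 9])    # 4 detections

    # Same call as in arkit_collate_fn: pad to the longest sequence with -1.
    padded = pad_sequence([labels_a, labels_b], batch_first=True, padding_value=-1)
    # padded == tensor([[ 3,  7, -1, -1],
    #                   [ 1,  4,  4,  9]])

    # Entries equal to -1 are padding and can be filtered out downstream,
    # analogous to the role pred_bbox_mask presumably plays for the padded boxes.
    valid = padded != -1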
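
And a rough usage sketch for the new DINOv3 embedders wired up in models/encoders.py and the direct_dino* configs. This assumes the local dinov3 hub repo and the checkpoint paths hard-coded in DINOv3Embedder.__init__ are available on the machine; the input tensor is a stand-in for whatever preprocessing the dataset applies.

    import torch
    from models.encoders import DINOv3Small_Embedder

    # The obj_embedder / place_embedder sections of direct_dinosmall.yaml map onto
    # these keyword arguments ('dinov3-small' in the embedder registry resolves to
    # DINOv3Small_Embedder, i.e. the dinov3_vits16 backbone).
    obj_embedder = DINOv3Small_Embedder(output_type='mean', freeze=True, weights='DEFAULT')
    place_embedder = DINOv3Small_Embedder(output_type='cls', freeze=True, weights='DEFAULT')

    # Dummy batch; H and W should be multiples of the ViT patch size (16).
    pixel_values = torch.randn(2, 3, 224, 224)

    with torch.no_grad():
        obj_emb = obj_embedder(pixel_values)      # B x embed_dim, mean over patch tokens
        place_emb = place_embedder(pixel_values)  # B x embed_dim, CLS token
    # For dinov3_vits16 the embedding dimension is 384, which is what the
    # object_dim / place_dim entries in the config refer to.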