3 changes: 2 additions & 1 deletion arkit_dataset.py
@@ -196,6 +196,7 @@ def __getitem__(self, idx):
if self.use_gdino:
pred_box, pred_label = self.get_pred_det(frame_id)
data['pred_bbox'] = pred_box
+data['pred_label'] = pred_label

# images: 2 x H x W x 3
# detections: 2 x N x 4; N may differ between the two frames
@@ -339,7 +340,7 @@ def arkit_collate_fn(batch):

ret['pred_bbox'] = batch_pred_bbox
ret['pred_bbox_mask'] = batch_pred_bbox_mask

+ret['pred_label'] = pad_sequence([item['pred_label'] for item in batch], batch_first=True, padding_value=-1)
return ret


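For reference, pad_sequence right-pads the per-sample label tensors to the longest N in the batch, mirroring how pred_bbox is padded alongside its mask. A minimal sketch of the behavior (label values made up):

import torch
from torch.nn.utils.rnn import pad_sequence

# two samples with different detection counts N
labels = [torch.tensor([3, 7, 1]), torch.tensor([5])]
batched = pad_sequence(labels, batch_first=True, padding_value=-1)
# tensor([[ 3,  7,  1],
#         [ 5, -1, -1]])  -> B x N_max; -1 marks padded slots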
6 changes: 3 additions & 3 deletions configs/experiments/aomsg.yaml
@@ -1,6 +1,6 @@
# sample training config
dataset_path: './data/msg'
-device: 1
+device: 0
pp_threshold: 0.3
object_threshold: 0.2

@@ -22,8 +22,8 @@ wandb: True
resume: False
resume_path: './path/to/checkpoints/0+.pth'
# for eval
-eval_output_dir: './exp-results/aomsg/2024-05-14_22-26-52' # this is specific to trained checkpoints
-eval_chkpt: 29-step22470+.pth #null for no checkpoint
+eval_output_dir: './exp-results/aomsg' # this is specific to trained checkpoints
+eval_chkpt: aomsg-s-4.pth #null for no checkpoint
save_every: True # whether to save per-video results
eval_step: 100

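As a rough sketch of how the renamed eval fields are presumably consumed (the actual loading code is outside this diff, so joining eval_output_dir and eval_chkpt is an assumption):

import os
import torch
import yaml

with open('configs/experiments/aomsg.yaml') as f:
    cfg = yaml.safe_load(f)

if cfg.get('eval_chkpt'):  # null would skip checkpoint loading
    ckpt = torch.load(os.path.join(cfg['eval_output_dir'], cfg['eval_chkpt']),
                      map_location='cpu')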
4 changes: 2 additions & 2 deletions configs/experiments/aomsg_gdino.yaml
@@ -14,7 +14,7 @@ loss_params:
eval_split: Test
train_split: Training

-output_dir: './exp-results/gdino'
+output_dir: './exp-results/aomsg-gdino'
output_file: 'train'
wandb: False

@@ -23,7 +23,7 @@ resume: False
resume_path: './path/to/checkpoints/0+.pth'

eval_output_dir: './exp-results/emb_ckpt' # this is specific to trained checkpoints
-eval_chkpt: aomsg_s_4.pth #29-step44940+.pth #null for no checkpoint
+eval_chkpt: aomsg-s-4.pth #29-step44940+.pth #null for no checkpoint
save_every: True # whether to save per-video results
eval_step: 100

41 changes: 41 additions & 0 deletions configs/experiments/direct_dinobase.yaml
@@ -0,0 +1,41 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-dinov3-base'
output_file: 'direct-dinov3-base'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-dinov3-base'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test


obj_embedder:
model: "dinov3-base" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov3-base" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
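One thing to double-check here: DINOv3 ViT-S/16, ViT-B/16 and ViT-L/16 emit 384-, 768- and 1024-dim tokens respectively, yet this config keeps object_dim/place_dim at 384 from the dinov2-small template (as does the large variant below). If SepMSG-direct actually reads these fields, a sanity check along these lines would catch the mismatch (EMBED_DIM and check_dims are illustrative, not project code):

# assumed backbone widths for the sanity check
EMBED_DIM = {'dinov3-small': 384, 'dinov3-base': 768, 'dinov3-large': 1024}

def check_dims(cfg):
    model = cfg['obj_embedder']['model']
    expected = EMBED_DIM.get(model)
    if expected and cfg['associator']['object_dim'] != expected:
        print(f"warning: object_dim={cfg['associator']['object_dim']}, "
              f"but {model} outputs {expected}-dim embeddings")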
41 changes: 41 additions & 0 deletions configs/experiments/direct_dinolarge.yaml
@@ -0,0 +1,41 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-dinov3-large'
output_file: 'direct-dinov3-large'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-dinov3-large'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test


obj_embedder:
model: "dinov3-large" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov3-large" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
41 changes: 41 additions & 0 deletions configs/experiments/direct_dinosmall.yaml
@@ -0,0 +1,41 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-dinov3-small'
output_file: 'direct-dinov3-small'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-dinov3-small'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test


obj_embedder:
model: "dinov3-small" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov3-small" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
49 changes: 49 additions & 0 deletions configs/experiments/direct_gdino.yaml
@@ -0,0 +1,49 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-gdino'
output_file: 'direct-gdino'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-gdino'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test

#direct-grounding-dino
detector:
model: grounding-dino # "grounding-dino", "fasterrcnn", "gt", "fasterrcnnv2"
num_classes: 18
freeze: True
weights: DEFAULT
pre_saved: True
result_path: "./exp-results/gdino-direct"

obj_embedder:
model: "dinov2-small" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov2-small" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
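With pre_saved: True the detector presumably isn't run at eval time; Grounding-DINO outputs are read back from result_path instead. A hypothetical loader, assuming one JSON of boxes/labels per frame (the actual on-disk layout is not shown in this diff):

import json
import os

def load_pred_det(result_path, video_id, frame_id):
    # one JSON file of pre-saved detections per frame -- layout assumed
    with open(os.path.join(result_path, video_id, f'{frame_id}.json')) as f:
        det = json.load(f)
    return det['boxes'], det['labels']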
2 changes: 1 addition & 1 deletion eval.py
@@ -69,7 +69,6 @@ def eval_per_video(dataset, dataloader, config, mapper, model, device, backproc,

with torch.no_grad():
for batch in tqdm(dataloader):
-# print(batch)
images = batch['image'].to(device)
# potentially pass more information to the model
additional_info = {
@@ -85,6 +84,7 @@ if 'pred_bbox' in batch:
if 'pred_bbox' in batch:
additional_info['pred_bbox'] = batch['pred_bbox'].to(device)
additional_info['pred_bbox_mask'] = batch['pred_bbox_mask'].to(device)
+additional_info['pred_label'] = batch['pred_label'].to(device)

results = model(images, additional_info)

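Downstream of this, the -1 padding in pred_label pairs with pred_bbox_mask to recover the real detections per sample; an illustrative consumer (not part of the model code shown here):

# strip padded slots for each sample in the batch
for i in range(images.size(0)):
    valid = additional_info['pred_label'][i] != -1   # padded entries are -1
    boxes_i = additional_info['pred_bbox'][i][valid]
    labels_i = additional_info['pred_label'][i][valid]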
102 changes: 100 additions & 2 deletions models/encoders.py
@@ -6,7 +6,7 @@

from torchvision.models import resnet50, resnet18, convnext_tiny, convnext_small, convnext_base
# from transformers import AutoImageProcessor, ConvNextModel
-from transformers import Dinov2Model, ViTModel
+from transformers import Dinov2Model, ViTModel, AutoModel, AutoImageProcessor


class ResNetEmbedder(nn.Module):
@@ -218,6 +218,93 @@ def forward(self, pixel_values):
# print("dino emb", embs.size())
# embs = last_hidden_states[:, 1:, :].mean(dim=1) # B x h, NOTE: its an average over all image tokens
embs = self.adaptor(embs)
return embs


class DINOv3Embedder(nn.Module):
# output type: cls or mean of the tokens, or perhaps tokens?
def __init__(self, model_type, output_type='mean', freeze=True, weights="DEFAULT"):
super().__init__()
repo_path = '/home/zw4269/dev/dinov3'  # local clone of the DINOv3 repo; machine-specific

# DINOv3 ViT checkpoints pretrained on web images; load only the requested
# variant rather than all three
checkpoints = {
    'dinov3_vits16': './exp-results/emb_ckpt/dino3/dinov3_vits16_pretrain_lvd1689m-08c60483.pth',
    'dinov3_vitb16': './exp-results/emb_ckpt/dino3/dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth',
    'dinov3_vitl16': './exp-results/emb_ckpt/dino3/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth',
}
if model_type not in checkpoints:
    raise NotImplementedError(model_type)
self.dino_model = torch.hub.load(repo_path, model_type, source='local',
                                 weights=checkpoints[model_type])

if freeze:
for param in self.dino_model.parameters():
param.requires_grad = False
self.dino_model.eval()

# Get feature dimension from the model
self.feature_dim = self.dino_model.embed_dim
self.adaptor = nn.Identity()

if output_type == 'mean':
print("use mean")
self.projection = self.mean_projection
elif output_type == 'cls':
self.projection = self.cls_projection
elif output_type == 'feature':
self.projection = self.seq_projection
elif output_type == 'max':
print("use max")
self.projection = self.max_projection
elif output_type.startswith('gem'):  # e.g. gem_3 -> p = 3
    print("use", output_type)
    p = float(output_type.split('_')[1])
    self.projection = self.gem_projection
    self.p = nn.Parameter(torch.ones(1) * p)
    if freeze:
        self.p.requires_grad = False
else:
raise NotImplementedError

def mean_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_patchtokens']
return x.mean(dim=1)

def max_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_patchtokens']
x_bhl = x.permute(0, 2, 1)
xp_bh = F.adaptive_max_pool1d(x_bhl, output_size=1).squeeze(2)
return xp_bh

def gem_projection(self, x, eps=1e-6):
    # Generalized-mean (GeM) pooling over patch tokens: (mean_i x_i^p)^(1/p);
    # p = 1 recovers mean pooling, p -> inf approaches max pooling
    if isinstance(x, dict):
        x = x['x_norm_patchtokens']
    x_clamped = F.relu(x).clamp(min=eps)
    gem_pooled = x_clamped.pow(self.p).mean(dim=1, keepdim=False).pow(1. / self.p)
    return gem_pooled

def cls_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_clstoken']
return x

def seq_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_patchtokens']
return x # B x L x h, full patch-token sequence

def forward(self, pixel_values):
    # forward_features returns a dict with 'x_norm_clstoken' and
    # 'x_norm_patchtokens'; each projection handles both dict and tensor input
    outputs = self.dino_model.forward_features(pixel_values)
    embs = self.projection(outputs)
embs = self.adaptor(embs)
return embs

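A quick smoke test for the new embedder, assuming the local DINOv3 repo and checkpoints referenced above are in place (shapes are for ViT-S/16 at 224x224):

embedder = DINOv3Embedder('dinov3_vits16', output_type='gem_3', freeze=True)
with torch.no_grad():
    embs = embedder(torch.randn(2, 3, 224, 224))
print(embs.shape)  # torch.Size([2, 384]), pooled over the 14x14 patch tokens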

@@ -258,6 +345,14 @@ def ViTLarge_Embedder(**kwargs):
def ViTHuge_Embedder(**kwargs):
return ViTEmbedder(model_type="vit_huge", **kwargs)

def DINOv3Small_Embedder(**kwargs):
return DINOv3Embedder(model_type="dinov3_vits16", **kwargs)

def DINOv3Base_Embedder(**kwargs):
return DINOv3Embedder(model_type="dinov3_vitb16", **kwargs)

def DINOv3Large_Embedder(**kwargs):
return DINOv3Embedder(model_type="dinov3_vitl16", **kwargs)


# def DINOv2Huge_Embedder(**kwargs):
@@ -275,5 +370,8 @@ def ViTHuge_Embedder(**kwargs):
'dinov2-large': DINOv2Large_Embedder,
'vit-base': ViTBase_Embedder,
'vit-large': ViTLarge_Embedder,
-    'vit-huge': ViTHuge_Embedder
+    'vit-huge': ViTHuge_Embedder,
+    'dinov3-small': DINOv3Small_Embedder,
+    'dinov3-base': DINOv3Base_Embedder,
+    'dinov3-large': DINOv3Large_Embedder,
}
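The new registry keys line up with the obj_embedder.model / place_embedder.model strings used in the configs above. The registry dict's name is cut off by the hunk, so the lookup below is a sketch (EMBEDDERS is an assumed name):

cfg = {'model': 'dinov3-base', 'weights': 'DEFAULT',
       'freeze': True, 'output_type': 'cls'}
builder = EMBEDDERS[cfg.pop('model')]  # hypothetical registry name
embedder = builder(**cfg)              # -> DINOv3Embedder('dinov3_vitb16', ...)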