3 changes: 2 additions & 1 deletion arkit_dataset.py
@@ -196,6 +196,7 @@ def __getitem__(self, idx):
if self.use_gdino:
pred_box, pred_label = self.get_pred_det(frame_id)
data['pred_bbox'] = pred_box
+data['pred_label'] = pred_label

# images: 2 x H x W x 3
# detections: 2 x N x 4; N may differ between the two frames
@@ -339,7 +340,7 @@ def arkit_collate_fn(batch):

ret['pred_bbox'] = batch_pred_bbox
ret['pred_bbox_mask'] = batch_pred_bbox_mask

+ret['pred_label'] = pad_sequence([item['pred_label'] for item in batch], batch_first=True, padding_value=-1)
return ret


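For reference, pad_sequence right-pads the per-sample label tensors to the longest N in the batch, mirroring how pred_bbox is padded alongside its mask. A minimal sketch of the behavior (label values made up):

import torch
from torch.nn.utils.rnn import pad_sequence

# two samples with different detection counts N
labels = [torch.tensor([3, 7, 1]), torch.tensor([5])]
batched = pad_sequence(labels, batch_first=True, padding_value=-1)
# tensor([[ 3,  7,  1],
#         [ 5, -1, -1]])  -> B x N_max; -1 marks padded slots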
6 changes: 3 additions & 3 deletions configs/experiments/aomsg.yaml
@@ -1,6 +1,6 @@
# sample training config
dataset_path: './data/msg'
-device: 1
+device: 0
pp_threshold: 0.3
object_threshold: 0.2

@@ -22,8 +22,8 @@ wandb: True
resume: False
resume_path: './path/to/checkpoints/0+.pth'
# for eval
-eval_output_dir: './exp-results/aomsg/2024-05-14_22-26-52' # this is specific to trained checkpoints
-eval_chkpt: 29-step22470+.pth #null for no checkpoint
+eval_output_dir: './exp-results/aomsg' # this is specific to trained checkpoints
+eval_chkpt: aomsg-s-4.pth #null for no checkpoint
save_every: True # whether to save per-video results
eval_step: 100

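As a rough sketch of how the renamed eval fields are presumably consumed (the actual loading code is outside this diff, so joining eval_output_dir and eval_chkpt is an assumption):

import os
import torch
import yaml

with open('configs/experiments/aomsg.yaml') as f:
    cfg = yaml.safe_load(f)

if cfg.get('eval_chkpt'):  # null would skip checkpoint loading
    ckpt = torch.load(os.path.join(cfg['eval_output_dir'], cfg['eval_chkpt']),
                      map_location='cpu')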
4 changes: 2 additions & 2 deletions configs/experiments/aomsg_gdino.yaml
@@ -14,7 +14,7 @@ loss_params:
eval_split: Test
train_split: Training

-output_dir: './exp-results/gdino'
+output_dir: './exp-results/aomsg-gdino'
output_file: 'train'
wandb: False

@@ -23,7 +23,7 @@ resume: False
resume_path: './path/to/checkpoints/0+.pth'

eval_output_dir: './exp-results/emb_ckpt' # this is specific to trained checkpoints
-eval_chkpt: aomsg_s_4.pth #29-step44940+.pth #null for no checkpoint
+eval_chkpt: aomsg-s-4.pth #29-step44940+.pth #null for no checkpoint
save_every: True # whether to save per-video results
eval_step: 100

41 changes: 41 additions & 0 deletions configs/experiments/direct_dinobase.yaml
@@ -0,0 +1,41 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-dinov3-base'
output_file: 'direct-dinov3-base'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-dinov3-base'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test


obj_embedder:
model: "dinov3-base" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov3-base" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
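One thing to double-check here: DINOv3 ViT-S/16, ViT-B/16 and ViT-L/16 emit 384-, 768- and 1024-dim tokens respectively, yet this config keeps object_dim/place_dim at 384 from the dinov2-small template (as does the large variant below). If SepMSG-direct actually reads these fields, a sanity check along these lines would catch the mismatch (EMBED_DIM and check_dims are illustrative, not project code):

# assumed backbone widths for the sanity check
EMBED_DIM = {'dinov3-small': 384, 'dinov3-base': 768, 'dinov3-large': 1024}

def check_dims(cfg):
    model = cfg['obj_embedder']['model']
    expected = EMBED_DIM.get(model)
    if expected and cfg['associator']['object_dim'] != expected:
        print(f"warning: object_dim={cfg['associator']['object_dim']}, "
              f"but {model} outputs {expected}-dim embeddings")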
41 changes: 41 additions & 0 deletions configs/experiments/direct_dinolarge.yaml
@@ -0,0 +1,41 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-dinov3-large'
output_file: 'direct-dinov3-large'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-dinov3-large'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test


obj_embedder:
model: "dinov3-large" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov3-large" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
41 changes: 41 additions & 0 deletions configs/experiments/direct_dinosmall.yaml
@@ -0,0 +1,41 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-dinov3-small'
output_file: 'direct-dinov3-small'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-dinov3-small'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test


obj_embedder:
model: "dinov3-small" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov3-small" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
49 changes: 49 additions & 0 deletions configs/experiments/direct_gdino.yaml
@@ -0,0 +1,49 @@
dataset_path: './data/msg'
device: 0
pp_threshold: 0.5
object_threshold: 0.3

eval_split: Test
# train_split: Training

output_dir: './exp-results/direct-gdino'
output_file: 'direct-gdino'
# wandb: True

resume: False

eval_output_dir: './exp-results/direct-gdino'
eval_chkpt: null

save_every: True # whether to save per-video results

num_workers: 4
# train_bs: 32
eval_bs: 64 # debug test

#direct-grounding-dino
detector:
model: grounding-dino # "grounding-dino", "fasterrcnn", "gt", "fasterrcnnv2"
num_classes: 18
freeze: True
weights: DEFAULT
pre_saved: True
result_path: "./exp-results/gdino-direct"

obj_embedder:
model: "dinov2-small" #"dinov2-small", "dinov2-base", "convnext-tiny-224", #'resnet50'
weights: DEFAULT
freeze: True
output_type: mean

place_embedder:
model: "dinov2-small" #"convnext-tiny", #'resnet50', # "dinov2-small", # "dinov2-base"
weights: DEFAULT
freeze: True
output_type: cls # mean, cls, feature, vec

associator:
model: "SepMSG-direct" # "AoMSG-S-2"
object_dim: 384 # FYI dinov2-small 384, dinov2-base 768
place_dim: 384
output_dim: null
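With pre_saved: True the detector presumably isn't run at eval time; Grounding-DINO outputs are read back from result_path instead. A hypothetical loader, assuming one JSON of boxes/labels per frame (the actual on-disk layout is not shown in this diff):

import json
import os

def load_pred_det(result_path, video_id, frame_id):
    # one JSON file of pre-saved detections per frame -- layout assumed
    with open(os.path.join(result_path, video_id, f'{frame_id}.json')) as f:
        det = json.load(f)
    return det['boxes'], det['labels']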
2 changes: 1 addition & 1 deletion eval.py
@@ -69,7 +69,6 @@ def eval_per_video(dataset, dataloader, config, mapper, model, device, backproc,

with torch.no_grad():
for batch in tqdm(dataloader):
-# print(batch)
images = batch['image'].to(device)
# potentially pass more information to the model
additional_info = {
@@ -85,6 +84,7 @@ if 'pred_bbox' in batch:
if 'pred_bbox' in batch:
additional_info['pred_bbox'] = batch['pred_bbox'].to(device)
additional_info['pred_bbox_mask'] = batch['pred_bbox_mask'].to(device)
+additional_info['pred_label'] = batch['pred_label'].to(device)

results = model(images, additional_info)

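Downstream of this, the -1 padding in pred_label pairs with pred_bbox_mask to recover the real detections per sample; an illustrative consumer (not part of the model code shown here):

# strip padded slots for each sample in the batch
for i in range(images.size(0)):
    valid = additional_info['pred_label'][i] != -1   # padded entries are -1
    boxes_i = additional_info['pred_bbox'][i][valid]
    labels_i = additional_info['pred_label'][i][valid]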
102 changes: 100 additions & 2 deletions models/encoders.py
@@ -6,7 +6,7 @@

from torchvision.models import resnet50, resnet18, convnext_tiny, convnext_small, convnext_base
# from transformers import AutoImageProcessor, ConvNextModel
-from transformers import Dinov2Model, ViTModel
+from transformers import Dinov2Model, ViTModel, AutoModel, AutoImageProcessor


class ResNetEmbedder(nn.Module):
@@ -218,6 +218,93 @@ def forward(self, pixel_values):
# print("dino emb", embs.size())
# embs = last_hidden_states[:, 1:, :].mean(dim=1) # B x h, NOTE: its an average over all image tokens
embs = self.adaptor(embs)
return embs


class DINOv3Embedder(nn.Module):
# output type: cls or mean of the tokens, or perhaps tokens?
def __init__(self, model_type, output_type='mean', freeze=True, weights="DEFAULT"):
super().__init__()
repo_path = '/home/zw4269/dev/dinov3'  # local clone of the DINOv3 repo; machine-specific

# DINOv3 ViT checkpoints pretrained on web images; load only the requested
# variant rather than all three
checkpoints = {
    'dinov3_vits16': './exp-results/emb_ckpt/dino3/dinov3_vits16_pretrain_lvd1689m-08c60483.pth',
    'dinov3_vitb16': './exp-results/emb_ckpt/dino3/dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth',
    'dinov3_vitl16': './exp-results/emb_ckpt/dino3/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth',
}
if model_type not in checkpoints:
    raise NotImplementedError(model_type)
self.dino_model = torch.hub.load(repo_path, model_type, source='local',
                                 weights=checkpoints[model_type])

if freeze:
for param in self.dino_model.parameters():
param.requires_grad = False
self.dino_model.eval()

# Get feature dimension from the model
self.feature_dim = self.dino_model.embed_dim
self.adaptor = nn.Identity()

if output_type == 'mean':
print("use mean")
self.projection = self.mean_projection
elif output_type == 'cls':
self.projection = self.cls_projection
elif output_type == 'feature':
self.projection = self.seq_projection
elif output_type == 'max':
print("use max")
self.projection = self.max_projection
elif output_type.startswith('gem'):  # e.g. gem_3 -> p = 3
    print("use", output_type)
    p = float(output_type.split('_')[1])
    self.projection = self.gem_projection
    self.p = nn.Parameter(torch.ones(1) * p)
    if freeze:
        self.p.requires_grad = False
else:
raise NotImplementedError

def mean_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_patchtokens']
return x.mean(dim=1)

def max_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_patchtokens']
x_bhl = x.permute(0, 2, 1)
xp_bh = F.adaptive_max_pool1d(x_bhl, output_size=1).squeeze(2)
return xp_bh

def gem_projection(self, x, eps=1e-6):
    # Generalized-mean (GeM) pooling over patch tokens: (mean_i x_i^p)^(1/p);
    # p = 1 recovers mean pooling, p -> inf approaches max pooling
    if isinstance(x, dict):
        x = x['x_norm_patchtokens']
    x_clamped = F.relu(x).clamp(min=eps)
    gem_pooled = x_clamped.pow(self.p).mean(dim=1, keepdim=False).pow(1. / self.p)
    return gem_pooled

def cls_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_clstoken']
return x

def seq_projection(self, x):
if isinstance(x, dict):
x = x['x_norm_patchtokens']
return x # B x L x h, full patch-token sequence

def forward(self, pixel_values):
    # forward_features returns a dict with 'x_norm_clstoken' and
    # 'x_norm_patchtokens'; each projection handles both dict and tensor input
    outputs = self.dino_model.forward_features(pixel_values)
    embs = self.projection(outputs)
embs = self.adaptor(embs)
return embs

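A quick smoke test for the new embedder, assuming the local DINOv3 repo and checkpoints referenced above are in place (shapes are for ViT-S/16 at 224x224):

embedder = DINOv3Embedder('dinov3_vits16', output_type='gem_3', freeze=True)
with torch.no_grad():
    embs = embedder(torch.randn(2, 3, 224, 224))
print(embs.shape)  # torch.Size([2, 384]), pooled over the 14x14 patch tokens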

@@ -258,6 +345,14 @@ def ViTLarge_Embedder(**kwargs):
def ViTHuge_Embedder(**kwargs):
return ViTEmbedder(model_type="vit_huge", **kwargs)

def DINOv3Small_Embedder(**kwargs):
return DINOv3Embedder(model_type="dinov3_vits16", **kwargs)

def DINOv3Base_Embedder(**kwargs):
return DINOv3Embedder(model_type="dinov3_vitb16", **kwargs)

def DINOv3Large_Embedder(**kwargs):
return DINOv3Embedder(model_type="dinov3_vitl16", **kwargs)


# def DINOv2Huge_Embedder(**kwargs):
@@ -275,5 +370,8 @@ def ViTHuge_Embedder(**kwargs):
'dinov2-large': DINOv2Large_Embedder,
'vit-base': ViTBase_Embedder,
'vit-large': ViTLarge_Embedder,
-    'vit-huge': ViTHuge_Embedder
+    'vit-huge': ViTHuge_Embedder,
+    'dinov3-small': DINOv3Small_Embedder,
+    'dinov3-base': DINOv3Base_Embedder,
+    'dinov3-large': DINOv3Large_Embedder,
}
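The new registry keys line up with the obj_embedder.model / place_embedder.model strings used in the configs above. The registry dict's name is cut off by the hunk, so the lookup below is a sketch (EMBEDDERS is an assumed name):

cfg = {'model': 'dinov3-base', 'weights': 'DEFAULT',
       'freeze': True, 'output_type': 'cls'}
builder = EMBEDDERS[cfg.pop('model')]  # hypothetical registry name
embedder = builder(**cfg)              # -> DINOv3Embedder('dinov3_vitb16', ...)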