Hi, I am training DINOv3 (vit_small) from scratch on a private dataset. The training logs look fine, but the run only saves sharded checkpoints, which for some reason I am unable to load. Also, as I understand it, a consolidated teacher checkpoint should be saved as well, but I only ever get the sharded files. Can anyone clarify this? My config is below, and a sketch of my loading attempt follows it.
```yaml
MODEL:
  META_ARCHITECTURE: SSLMetaArch
  DEVICE: cuda
  WEIGHTS: ''
  DTYPE: float32
compute_precision:
  param_dtype: bf16
  reduce_dtype: fp32
  sharding_strategy: SHARD_GRAD_OP
dino:
  loss_weight: 1.0
  global_ignore_diagonal: true
  head_n_prototypes: 262144
  head_bottleneck_dim: 512
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 8192
  koleo_loss_weight: 0.1
  koleo_loss_distributed: false
  koleo_topk: 1
  koleo_distributed_replicas: 0
  koleo_distributed_loss_group_size: null
  force_weight_norm: false
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  mask_ratio_min_max:
  - 0.1
  - 0.5
  mask_random_circular_shift: false
  force_masking_even_with_zero_weight: false
  separate_head: true
  head_n_prototypes: 98304
  head_bottleneck_dim: 384
  head_norm_last_layer: false
  head_nlayers: 3
  head_hidden_dim: 384
gram:
  use_loss: false
  compute_stats: false
train:
  batch_size_per_gpu: 64
  dataset_path: null
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1000
  monitor_gradient_norm: false
  chunk_schedule: []
  cache_dataset: true
  use_teacher_head: true
  learn_from_teacher_tokens: false
  centering: sinkhorn_knopp
  checkpointing: true
  checkpointing_full: true
  compile: true
  cudagraphs: false
  cell_augmentation: false
  cell_augmentation_type: hpa
  sharded_eval_checkpoint: false
student:
  arch: vit_small
  patch_size: 16
  drop_path_rate: 0.4
  layerscale: 1.0e-05
  patch_drop: 0.0
  pretrained_weights: ''
  ffn_layer: swiglu64
  ffn_ratio: 3
  resume_from_teacher_chkpt: ''
  qkv_bias: false
  proj_bias: true
  ffn_bias: true
  norm_layer: layernormbf16
  n_storage_tokens: 4
  untie_cls_and_patch_norms: false
  untie_global_and_local_cls_norm: true
  mask_k_bias: true
  in_chans: 3
  pos_embed_type: rope
  pos_embed_rope_base: 100
  pos_embed_rope_min_period: null
  pos_embed_rope_max_period: null
  pos_embed_rope_normalize_coords: separate
  pos_embed_rope_shift_coords: null
  pos_embed_rope_jitter_coords: null
  pos_embed_rope_rescale_coords: 2
  pos_embed_rope_dtype: fp32
  fp8_enabled: false
  fp8_filter: blocks
teacher:
  momentum_teacher: null
  final_momentum_teacher: null
  warmup_teacher_temp: null
  teacher_temp: null
  warmup_teacher_temp_epochs: null
  in_chans: 3
distillation:
  enabled: false
  full_cfg_path: ''
  checkpoint_path: ''
multidistillation:
  enabled: false
hrft:
  enabled: false
  checkpoint_path: ''
optim:
  epochs: 100
  optimizer: adamw
  weight_decay: null
  weight_decay_end: null
  lr: null
  warmup_epochs: null
  min_lr: null
  schedule_trunc_extra: null
  clip_grad: 30.0
  freeze_last_layer_epochs: null
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  dino_head_wd_multiplier: 1.0
  layerwise_decay: 0.98
  multi_tensor_optim: true
  dump_fsdp_weights_path: ''
  adamw_beta1: 0.9
  adamw_beta2: 0.99
crops:
  global_crops_scale:
  - 0.32
  - 1.0
  local_crops_number: 8
  local_crops_scale:
  - 0.05
  - 0.32
  global_crops_size: 256
  local_crops_size: 112
  localcrops_subset_of_globalcrops: false
  share_color_jitter: false
  horizontal_flips: false
  rgb_mean:
  - 0.485
  - 0.456
  - 0.406
  rgb_std:
  - 0.229
  - 0.224
  - 0.225
checkpointing:
  period: 10
  max_to_keep: 3
  keep_every: 5000
schedules:
  lr:
    start: 0
    peak: 5.0e-05
    end: 5.0e-05
    warmup_epochs: 100
    freeze_last_layer_epochs: 5
  weight_decay:
    start: 0.04
    peak: 0.04
    end: 0.04
    warmup_epochs: 0
  teacher_temp:
    start: 0.04
    peak: 0.07
    end: 0.07
    warmup_epochs: 100
  momentum:
    start: 0.994
    peak: 0.994
    end: 0.994
    warmup_epochs: 0
```
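For reference, this is roughly how I have been trying to consolidate and inspect the sharded checkpoint. This is a minimal sketch that assumes the shards are in PyTorch's distributed checkpoint (DCP) format; the paths and the `"teacher"` key are my guesses, not something I have confirmed against the repo:

```python
import torch
# dcp_to_torch_save is available in PyTorch >= 2.2 and converts a sharded
# DCP checkpoint directory into a single torch.save file, without needing
# to initialize a process group.
from torch.distributed.checkpoint.format_utils import dcp_to_torch_save

# Placeholder paths for my run directory and the consolidated output file.
dcp_to_torch_save("output_dir/ckpt/model_final", "consolidated.pth")

# Inspect what was actually saved -- I expected a teacher entry in here.
state = torch.load("consolidated.pth", map_location="cpu", weights_only=False)
print(state.keys())

# If a teacher entry does exist, I would expect to pull its weights out
# like this (again, the key name is an assumption on my part):
teacher_state = state.get("teacher")
```

Is this the intended way to get a loadable teacher checkpoint out of a run, or is there a consolidation step in the repo that I am missing?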