
Commit 81ca234

Add files via upload

1 parent 3f71b1a commit 81ca234

27 files changed: 6,923 additions, 0 deletions

sam2/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from hydra import initialize_config_module
from hydra.core.global_hydra import GlobalHydra

if not GlobalHydra.instance().is_initialized():
    initialize_config_module("sam2", version_base="1.2")
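
A minimal sketch, not part of this commit, of what the guard above provides: importing sam2 initializes Hydra's global config module exactly once, so later hydra.compose calls can resolve the package's bundled config names.

import sam2  # runs the __init__.py above; safe to import repeatedly
from hydra.core.global_hydra import GlobalHydra

assert GlobalHydra.instance().is_initialized()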

sam2/automatic_mask_generator.py

Lines changed: 454 additions & 0 deletions
Large diffs are not rendered by default.

sam2/build_sam.py

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os

import torch
from hydra import compose
from hydra.utils import instantiate
from omegaconf import OmegaConf

import sam2

# Check if the user is running Python from the parent directory of the sam2 repo
# (i.e. the directory where this repo is cloned into) -- this is not supported since
# it could shadow the sam2 package and cause issues.
if os.path.isdir(os.path.join(sam2.__path__[0], "sam2")):
    # If the user has "sam2/sam2" in their path, they are likely importing the repo itself
    # as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory).
    # This typically happens because the user is running Python from the parent directory
    # that contains the sam2 repo they cloned.
    raise RuntimeError(
        "You're likely running Python from the parent directory of the sam2 repository "
        "(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "
        "This is not supported since the `sam2` Python package could be shadowed by the "
        "repository name (the repository is also named `sam2` and contains the Python package "
        "in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
        "rather than its parent dir, or from your home directory) after installing SAM 2."
    )


HF_MODEL_ID_TO_FILENAMES = {
    "facebook/sam2-hiera-tiny": (
        "configs/sam2/sam2_hiera_t.yaml",
        "sam2_hiera_tiny.pt",
    ),
    "facebook/sam2-hiera-small": (
        "configs/sam2/sam2_hiera_s.yaml",
        "sam2_hiera_small.pt",
    ),
    "facebook/sam2-hiera-base-plus": (
        "configs/sam2/sam2_hiera_b+.yaml",
        "sam2_hiera_base_plus.pt",
    ),
    "facebook/sam2-hiera-large": (
        "configs/sam2/sam2_hiera_l.yaml",
        "sam2_hiera_large.pt",
    ),
    "facebook/sam2.1-hiera-tiny": (
        "configs/sam2.1/sam2.1_hiera_t.yaml",
        "sam2.1_hiera_tiny.pt",
    ),
    "facebook/sam2.1-hiera-small": (
        "configs/sam2.1/sam2.1_hiera_s.yaml",
        "sam2.1_hiera_small.pt",
    ),
    "facebook/sam2.1-hiera-base-plus": (
        "configs/sam2.1/sam2.1_hiera_b+.yaml",
        "sam2.1_hiera_base_plus.pt",
    ),
    "facebook/sam2.1-hiera-large": (
        "configs/sam2.1/sam2.1_hiera_l.yaml",
        "sam2.1_hiera_large.pt",
    ),
}


def build_sam2(
    config_file,
    ckpt_path=None,
    device="cuda",
    mode="eval",
    hydra_overrides_extra=[],
    apply_postprocessing=True,
    **kwargs,
):

    if apply_postprocessing:
        hydra_overrides_extra = hydra_overrides_extra.copy()
        hydra_overrides_extra += [
            # dynamically fall back to multi-mask if the single mask is not stable
            "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
            "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
            "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
        ]
    # Read config and init model
    cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)
    OmegaConf.resolve(cfg)
    model = instantiate(cfg.model, _recursive_=True)
    _load_checkpoint(model, ckpt_path)
    model = model.to(device)
    if mode == "eval":
        model.eval()
    return model
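
A minimal usage sketch for build_sam2 (not part of this commit). The checkpoint path is an assumption; the config name comes from HF_MODEL_ID_TO_FILENAMES above and is resolved through the Hydra config module initialized in sam2/__init__.py.

import torch
from sam2.build_sam import build_sam2

checkpoint = "./checkpoints/sam2.1_hiera_base_plus.pt"  # assumed local path
model_cfg = "configs/sam2.1/sam2.1_hiera_b+.yaml"  # listed in HF_MODEL_ID_TO_FILENAMES
device = "cuda" if torch.cuda.is_available() else "cpu"

model = build_sam2(model_cfg, ckpt_path=checkpoint, device=device)  # returned in eval mode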


def build_sam2_video_predictor(
    config_file,
    ckpt_path=None,
    device="cuda",
    mode="eval",
    hydra_overrides_extra=[],
    apply_postprocessing=True,
    vos_optimized=False,
    **kwargs,
):
    hydra_overrides = [
        "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
    ]
    if vos_optimized:
        hydra_overrides = [
            "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictorVOS",
            "++model.compile_image_encoder=True",  # Let sam2_base handle this
        ]

    if apply_postprocessing:
        hydra_overrides_extra = hydra_overrides_extra.copy()
        hydra_overrides_extra += [
            # dynamically fall back to multi-mask if the single mask is not stable
            "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
            "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
            "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
            # binarize the sigmoid mask logits on interacted frames with clicks in the memory
            # encoder so that the encoded masks are exactly as what users see from clicking
            "++model.binarize_mask_from_pts_for_mem_enc=true",
            # fill small holes in the low-res masks up to `fill_hole_area`
            # (before resizing them to the original video resolution)
            "++model.fill_hole_area=8",
        ]
    hydra_overrides.extend(hydra_overrides_extra)

    # Read config and init model
    cfg = compose(config_name=config_file, overrides=hydra_overrides)
    OmegaConf.resolve(cfg)
    model = instantiate(cfg.model, _recursive_=True)
    _load_checkpoint(model, ckpt_path)
    model = model.to(device)
    if mode == "eval":
        model.eval()
    return model
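
A hedged sketch of building the video predictor (paths assumed, as above). Setting vos_optimized=True swaps the target class to SAM2VideoPredictorVOS and turns on image-encoder compilation; the postprocessing overrides are appended to any caller-supplied hydra_overrides_extra.

from sam2.build_sam import build_sam2_video_predictor

predictor = build_sam2_video_predictor(
    "configs/sam2.1/sam2.1_hiera_b+.yaml",  # config name, as above
    ckpt_path="./checkpoints/sam2.1_hiera_base_plus.pt",  # assumed local path
    device="cuda",
    vos_optimized=False,  # True selects the compiled SAM2VideoPredictorVOS variant
)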


def _hf_download(model_id):
    from huggingface_hub import hf_hub_download

    config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]
    ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)
    return config_name, ckpt_path


def build_sam2_hf(model_id, **kwargs):
    config_name, ckpt_path = _hf_download(model_id)
    return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)


def build_sam2_video_predictor_hf(model_id, **kwargs):
    config_name, ckpt_path = _hf_download(model_id)
    return build_sam2_video_predictor(
        config_file=config_name, ckpt_path=ckpt_path, **kwargs
    )
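
The Hugging Face wrappers need only a model ID from HF_MODEL_ID_TO_FILENAMES; hf_hub_download fetches the checkpoint into the local Hugging Face cache on first use. A short sketch, not part of the diff:

from sam2.build_sam import build_sam2_hf, build_sam2_video_predictor_hf

model = build_sam2_hf("facebook/sam2.1-hiera-large")
predictor = build_sam2_video_predictor_hf("facebook/sam2.1-hiera-tiny", device="cuda")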


def _load_checkpoint(model, ckpt_path):
    if ckpt_path is not None:
        sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
        missing_keys, unexpected_keys = model.load_state_dict(sd)
        if missing_keys:
            logging.error(missing_keys)
            raise RuntimeError()
        if unexpected_keys:
            logging.error(unexpected_keys)
            raise RuntimeError()
        logging.info("Loaded checkpoint successfully")
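
As _load_checkpoint shows, a checkpoint file is a dict whose weights sit under the "model" key. A hedged inspection sketch (the filename is an assumption):

import torch

ckpt = torch.load("sam2.1_hiera_base_plus.pt", map_location="cpu", weights_only=True)
state_dict = ckpt["model"]  # the only key _load_checkpoint reads
print(f"{len(state_dict)} tensors in checkpoint")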
sam2/configs/sam2.1/sam2.1_hiera_b+.yaml

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 112
      num_heads: 2
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [896, 448, 224, 112]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  no_obj_embed_spatial: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: true
  proj_tpos_enc_in_obj_ptrs: true
  use_signed_tpos_enc_to_obj_ptrs: true
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
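
How a config like the one above becomes a model, as build_sam2 does it: compose resolves the YAML by name (the `# @package _global_` header places `model` at the top level), and instantiate recursively constructs every `_target_` class with its keyword arguments. A sketch, assuming this file's config name:

import sam2  # noqa: F401  # initializes Hydra's config module (see sam2/__init__.py)
from hydra import compose
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = compose(config_name="configs/sam2.1/sam2.1_hiera_b+.yaml")  # assumed name for this file
OmegaConf.resolve(cfg)
model = instantiate(cfg.model, _recursive_=True)  # builds SAM2Base with its nested modules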
