Skip to content

Commit 321da70

Browse files
committed
release training code
1 parent 584123d commit 321da70

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+4028
-227
lines changed

.amltconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"project_name": "vldetection", "storage_account_name": "vldetection", "container_name": "amulet", "blob_storage_account_name": "vldetection", "registry_name": "projects", "local_path": "/home/t-xzou/xdecoder_code_release/X-Decoder", "default_output_dir": "/home/t-xzou/xdecoder_code_release/X-Decoder/amlt", "project_uuid": "7355623420.53964-814624d9-7249-4fdc-a216-720c60847ba0", "version": "9.9.2"}

.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,9 @@ kill.sh
9999
draws/
100100
plot/
101101

102-
103-
102+
*run.sh
103+
exps/*
104+
amlt/*
104105

105106
*venv/*
106107
*.pt
Lines changed: 378 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,378 @@
1+
# --------------------------------------------------------
2+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3+
# Copyright (c) 2022 Microsoft
4+
# Licensed under The MIT License [see LICENSE for details]
5+
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
6+
# --------------------------------------------------------
7+
8+
# Define Test/Trainer/Saving
9+
PIPELINE: XDecoderPipeline
10+
TRAINER: xdecoder
11+
SAVE_DIR: '../../data/output/test'
12+
base_path: "./" # kept lowercase because mainz refers to this key in lowercase
13+
14+
# Resume logic
15+
RESUME: false
16+
WEIGHT: false
17+
RESET_DATA_LOADER: false
18+
RESUME_FROM: ''
19+
PYLEARN_MODEL: '' # model to resume from when running evaluation
20+
DONT_LOAD_MODEL: true
21+
22+
# Logging and Debug
23+
LOG_EVERY: 500
24+
FIND_UNUSED_PARAMETERS: false
25+
26+
# Speed up training
27+
FP16: false
28+
PORT: '36873'
29+
30+
# misc
31+
LOADER:
32+
JOINT: True
33+
KEY_DATASET: 'coco'
34+
35+
##################
36+
# Task settings
37+
##################
38+
VERBOSE: true
39+
MODEL:
40+
NAME: xdecoder_model
41+
HEAD: xdecoder_head
42+
MASK_ON: false
43+
KEYPOINT_ON: false
44+
LOAD_PROPOSALS: false
45+
DIM_PROJ: 512
46+
BACKBONE_DIM: 1536
47+
TEXT:
48+
ARCH: vlpencoder
49+
NAME: transformer
50+
TOKENIZER: clip
51+
CONTEXT_LENGTH: 77 # 77
52+
WIDTH: 512
53+
HEADS: 8
54+
LAYERS: 12 # 6
55+
AUTOGRESSIVE: True
56+
BACKBONE:
57+
NAME: focal
58+
PRETRAINED: ''
59+
LOAD_PRETRAINED: false
60+
FOCAL:
61+
PRETRAIN_IMG_SIZE: 224
62+
PATCH_SIZE: 4
63+
EMBED_DIM: 192
64+
DEPTHS: [2, 2, 18, 2]
65+
FOCAL_LEVELS: [4, 4, 4, 4]
66+
FOCAL_WINDOWS: [3, 3, 3, 3]
67+
DROP_PATH_RATE: 0.3
68+
MLP_RATIO: 4.0
69+
DROP_RATE: 0.0
70+
PATCH_NORM: True
71+
USE_CONV_EMBED: True
72+
SCALING_MODULATOR: True
73+
USE_CHECKPOINT: False
74+
USE_POSTLN: true
75+
USE_POSTLN_IN_MODULATION: false
76+
USE_LAYERSCALE: True
77+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
78+
OUT_INDICES: [0, 1, 2, 3]
79+
ENCODER:
80+
NAME: transformer_encoder_fpn
81+
IGNORE_VALUE: 255
82+
NUM_CLASSES: 133
83+
LOSS_WEIGHT: 1.0
84+
CONVS_DIM: 512
85+
MASK_DIM: 512
86+
NORM: "GN"
87+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
88+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
89+
COMMON_STRIDE: 4
90+
TRANSFORMER_ENC_LAYERS: 6
91+
DECODER:
92+
NAME: xdecoder
93+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
94+
MASK: True
95+
GROUNDING:
96+
ENABLED: True
97+
MAX_LEN: 5
98+
TEXT_WEIGHT: 2.0
99+
CLASS_WEIGHT: 0.5
100+
DETECTION: False
101+
CAPTION:
102+
ENABLED: True
103+
PHRASE_PROB: 0.0
104+
SIM_THRES: 0.95
105+
CAPTIONING:
106+
ENABLED: True
107+
STEP: 50
108+
RETRIEVAL:
109+
ENABLED: True
110+
DIM_IMG: 1024
111+
ENSEMBLE: True
112+
DEEP_SUPERVISION: True
113+
NO_OBJECT_WEIGHT: 0.1
114+
CAPTION_WEIGHT: 1.0
115+
CAPTIONING_WEIGHT: 2.0
116+
RETRIEVAL_WEIGHT: 2.0
117+
BACKBONER_WEIGHT: 8.0
118+
GCLASS_WEIGHT: 0.4
119+
GMASK_WEIGHT: 1.0
120+
GDICE_WEIGHT: 1.0
121+
OCLASS_WEIGHT: 0.4
122+
OMASK_WEIGHT: 1.0
123+
ODICE_WEIGHT: 1.0
124+
CLASS_WEIGHT: 2.0
125+
MASK_WEIGHT: 5.0
126+
DICE_WEIGHT: 5.0
127+
BBOX_WEIGHT: 5.0
128+
GIOU_WEIGHT: 2.0
129+
HIDDEN_DIM: 512
130+
NUM_OBJECT_QUERIES: 101
131+
NHEADS: 8
132+
DROPOUT: 0.0
133+
DIM_FEEDFORWARD: 2048
134+
PRE_NORM: False
135+
ENFORCE_INPUT_PROJ: False
136+
SIZE_DIVISIBILITY: 32
137+
TRAIN_NUM_POINTS: 12544
138+
OVERSAMPLE_RATIO: 3.0
139+
IMPORTANCE_SAMPLE_RATIO: 0.75
140+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
141+
TOP_GROUNDING_LAYERS: 3
142+
TOP_CAPTION_LAYERS: 3
143+
TOP_CAPTIONING_LAYERS: 3
144+
TOP_RETRIEVAL_LAYERS: 3
145+
TEST:
146+
SEMANTIC_ON: True
147+
INSTANCE_ON: True
148+
PANOPTIC_ON: True
149+
OVERLAP_THRESHOLD: 0.8
150+
OBJECT_MASK_THRESHOLD: 0.8
151+
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
152+
153+
COCO:
154+
INPUT:
155+
MIN_SIZE_TRAIN: 800
156+
MAX_SIZE_TRAIN: 1333
157+
MIN_SIZE_TRAIN_SAMPLING: 'choice'
158+
MIN_SIZE_TEST: 800
159+
MAX_SIZE_TEST: 1333
160+
IMAGE_SIZE: 1024
161+
MIN_SCALE: 0.1
162+
MAX_SCALE: 2.0
163+
DATASET_MAPPER_NAME: "coco_panoptic_lsj"
164+
IGNORE_VALUE: 255
165+
COLOR_AUG_SSD: False
166+
SIZE_DIVISIBILITY: 32
167+
RANDOM_FLIP: "horizontal"
168+
MASK_FORMAT: "polygon"
169+
FORMAT: "RGB"
170+
CROP:
171+
ENABLED: True
172+
DATASET:
173+
DATASET: 'coco'
174+
TEST:
175+
DETECTIONS_PER_IMAGE: 100
176+
NAME: coco_eval
177+
IOU_TYPE: ['bbox', 'segm']
178+
USE_MULTISCALE: false
179+
BATCH_SIZE_TOTAL: 8
180+
MODEL_FILE: ''
181+
AUG:
182+
ENABLED: False
183+
TRAIN:
184+
ASPECT_RATIO_GROUPING: true
185+
BATCH_SIZE_TOTAL: 2
186+
BATCH_SIZE_PER_GPU: 1
187+
SHUFFLE: true
188+
DATALOADER:
189+
FILTER_EMPTY_ANNOTATIONS: False
190+
NUM_WORKERS: 2
191+
LOAD_PROPOSALS: False
192+
SAMPLER_TRAIN: "TrainingSampler"
193+
ASPECT_RATIO_GROUPING: True
194+
195+
VLP:
196+
INPUT:
197+
IMAGE_SIZE: 224
198+
DATASET_MAPPER_NAME: "vlpretrain"
199+
IGNORE_VALUE: 255
200+
COLOR_AUG_SSD: False
201+
SIZE_DIVISIBILITY: 32
202+
MASK_FORMAT: "polygon"
203+
FORMAT: "RGB"
204+
CROP:
205+
ENABLED: True
206+
TRAIN:
207+
BATCH_SIZE_TOTAL: 2
208+
BATCH_SIZE_PER_GPU: 1
209+
TEST:
210+
BATCH_SIZE_TOTAL: 256
211+
DATALOADER:
212+
FILTER_EMPTY_ANNOTATIONS: False
213+
NUM_WORKERS: 16
214+
LOAD_PROPOSALS: False
215+
SAMPLER_TRAIN: "TrainingSampler"
216+
ASPECT_RATIO_GROUPING: True
217+
218+
INPUT:
219+
PIXEL_MEAN: [123.675, 116.280, 103.530]
220+
PIXEL_STD: [58.395, 57.120, 57.375]
221+
222+
DATASETS:
223+
TRAIN: ["coco_2017_train_panoptic_filtall_with_sem_seg_caption_grounding", "vlp_train"]
224+
# open vocabulary segmentation evaluation.
225+
# TEST: ["ade20k_panoptic_val"]
226+
TEST: ["coco_2017_val_panoptic_with_sem_seg", "vlp_captioning_val", "refcocog_val_umd", "vlp_val", "ade20k_panoptic_val"]
227+
# TEST: ["ade20k_panoptic_val", "ade20k_full_sem_seg_val", "sunrgbd_37_val_seg", "scannet_21_val_seg", "scannet_21_panoptic_val", "scannet_41_val_seg", "cityscapes_fine_panoptic_val", "cityscapes_fine_instance_seg_val", "cityscapes_fine_sem_seg_val", "bdd10k_val_sem_seg", "bdd10k_40_panoptic_val"]
228+
# Supervised metrics evaluation.
229+
# TEST: ["vlp_captioning_val", "refcocog_val_umd", "vlp_val"]
230+
SIZE_DIVISIBILITY: 32
231+
PROPOSAL_FILES_TRAIN: []
232+
233+
DATALOADER:
234+
FILTER_EMPTY_ANNOTATIONS: False
235+
NUM_WORKERS: 16
236+
LOAD_PROPOSALS: False
237+
SAMPLER_TRAIN: "TrainingSampler"
238+
ASPECT_RATIO_GROUPING: True
239+
240+
# Detectron2 training config for optimizer and lr scheduler
241+
SOLVER:
242+
BASE_LR: 0.0001
243+
STEPS: [0.88889, 0.96296]
244+
MAX_ITER: 1
245+
GAMMA: 0.1
246+
WARMUP_FACTOR: 1.0
247+
WARMUP_ITERS: 10
248+
WARMUP_METHOD: "linear"
249+
WEIGHT_DECAY: 0.05
250+
OPTIMIZER: "ADAMW"
251+
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
252+
LR_MULTIPLIER:
253+
backbone: 0.1
254+
lang_encoder: 0.1
255+
WEIGHT_DECAY_NORM: 0.0
256+
WEIGHT_DECAY_EMBED: 0.0
257+
CLIP_GRADIENTS:
258+
ENABLED: True
259+
CLIP_TYPE: "full_model"
260+
CLIP_VALUE: 5.0 # 0.01
261+
NORM_TYPE: 2.0
262+
AMP:
263+
ENABLED: True
264+
MAX_NUM_EPOCHS: 50
265+
266+
# Evaluation Dataset
267+
ADE20K:
268+
INPUT:
269+
MIN_SIZE_TRAIN: 640
270+
MIN_SIZE_TRAIN_SAMPLING: "choice"
271+
MIN_SIZE_TEST: 640
272+
MAX_SIZE_TRAIN: 2560
273+
MAX_SIZE_TEST: 2560
274+
MASK_FORMAT: "polygon"
275+
CROP:
276+
ENABLED: True
277+
TYPE: "absolute"
278+
SIZE: (640, 640)
279+
SINGLE_CATEGORY_MAX_AREA: 1.0
280+
COLOR_AUG_SSD: True
281+
SIZE_DIVISIBILITY: 640 # used in dataset mapper
282+
DATASET_MAPPER_NAME: "mask_former_panoptic"
283+
FORMAT: "RGB"
284+
DATASET:
285+
DATASET: 'ade'
286+
TEST:
287+
BATCH_SIZE_TOTAL: 8
288+
289+
290+
REF:
291+
INPUT:
292+
PIXEL_MEAN: [123.675, 116.280, 103.530]
293+
PIXEL_STD: [58.395, 57.120, 57.375]
294+
MIN_SIZE_TEST: 512
295+
MAX_SIZE_TEST: 1024
296+
FORMAT: "RGB"
297+
DATALOADER:
298+
FILTER_EMPTY_ANNOTATIONS: False
299+
NUM_WORKERS: 0
300+
LOAD_PROPOSALS: False
301+
SAMPLER_TRAIN: "TrainingSampler"
302+
ASPECT_RATIO_GROUPING: False
303+
TEST:
304+
BATCH_SIZE_TOTAL: 8
305+
306+
SUN:
307+
INPUT:
308+
PIXEL_MEAN: [123.675, 116.280, 103.530]
309+
PIXEL_STD: [58.395, 57.120, 57.375]
310+
MIN_SIZE_TEST: 512
311+
MAX_SIZE_TEST: 1024
312+
DATALOADER:
313+
FILTER_EMPTY_ANNOTATIONS: False
314+
NUM_WORKERS: 0
315+
LOAD_PROPOSALS: False
316+
SAMPLER_TRAIN: "TrainingSampler"
317+
ASPECT_RATIO_GROUPING: False
318+
TEST:
319+
BATCH_SIZE_TOTAL: 8
320+
321+
SCAN:
322+
INPUT:
323+
PIXEL_MEAN: [123.675, 116.280, 103.530]
324+
PIXEL_STD: [58.395, 57.120, 57.375]
325+
MIN_SIZE_TEST: 512
326+
MAX_SIZE_TEST: 1024
327+
DATALOADER:
328+
FILTER_EMPTY_ANNOTATIONS: False
329+
NUM_WORKERS: 0
330+
LOAD_PROPOSALS: False
331+
SAMPLER_TRAIN: "TrainingSampler"
332+
ASPECT_RATIO_GROUPING: False
333+
TEST:
334+
BATCH_SIZE_TOTAL: 8
335+
336+
BDD:
337+
INPUT:
338+
PIXEL_MEAN: [123.675, 116.280, 103.530]
339+
PIXEL_STD: [58.395, 57.120, 57.375]
340+
MIN_SIZE_TEST: 800
341+
MAX_SIZE_TEST: 1333
342+
DATALOADER:
343+
FILTER_EMPTY_ANNOTATIONS: False
344+
NUM_WORKERS: 0
345+
LOAD_PROPOSALS: False
346+
SAMPLER_TRAIN: "TrainingSampler"
347+
ASPECT_RATIO_GROUPING: False
348+
TEST:
349+
BATCH_SIZE_TOTAL: 8
350+
351+
CITY:
352+
INPUT:
353+
MIN_SIZE_TRAIN: 1024 # !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
354+
MIN_SIZE_TRAIN_SAMPLING: "choice"
355+
MIN_SIZE_TEST: 1024
356+
MAX_SIZE_TRAIN: 4096
357+
MAX_SIZE_TEST: 2048
358+
CROP:
359+
ENABLED: True
360+
TYPE: "absolute"
361+
SIZE: (512, 1024)
362+
SINGLE_CATEGORY_MAX_AREA: 1.0
363+
COLOR_AUG_SSD: True
364+
SIZE_DIVISIBILITY: -1
365+
FORMAT: "RGB"
366+
DATASET_MAPPER_NAME: "mask_former_panoptic"
367+
MASK_FORMAT: "polygon"
368+
TEST:
369+
EVAL_PERIOD: 5000
370+
BATCH_SIZE_TOTAL: 8
371+
AUG:
372+
ENABLED: False
373+
MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
374+
MAX_SIZE: 4096
375+
FLIP: True
376+
DATALOADER:
377+
FILTER_EMPTY_ANNOTATIONS: True
378+
NUM_WORKERS: 4

0 commit comments

Comments
 (0)