-
Notifications
You must be signed in to change notification settings - Fork 1
Description
Dear authors,
Thank you for your great work on this project.
I have been training the model in the same environment described in the paper, but I am not getting consistent results. Although I slightly reduced the training scale (e.g., fewer samples), the performance is significantly lower than expected.
Specifically, I am training from scratch using llava-v1.5-7b, but the current results are as follows:
CIoU: 0.074
GIoU: 0.095
This is quite different from what I observed when training the LISA model from scratch in the past, where I obtained:
CIoU: 0.519
GIoU: 0.414
Is there a chance that I might have misconfigured something? Or is such low performance expected in the early stages of training?
Any insights or suggestions would be greatly appreciated.
Thank you again for your valuable work!
Best regards,
def parse_args(args):
    """Parse command-line arguments for READ model training.

    Parameters
    ----------
    args : list[str]
        Argument strings to parse (e.g. ``sys.argv[1:]``).

    Returns
    -------
    argparse.Namespace
        The parsed training configuration.
    """
    parser = argparse.ArgumentParser(description="READ Model Training")
    # Fixed help text: this is the per-node local rank, not the node rank.
    parser.add_argument(
        "--local_rank", default=0, type=int, help="local rank for distributed training"
    )
    parser.add_argument(
        "--version", default="liuhaotian/llava-v1.5-7b"
    )
    parser.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--image_size", default=1024, type=int, help="image size")
    parser.add_argument("--model_max_length", default=512, type=int)
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument(
        "--vision_tower", default="openai/clip-vit-large-patch14-336", type=str
    )
    parser.add_argument("--load_in_8bit", action="store_true", default=False)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    # NOTE(review): the default lists 4 datasets but --sample_rates defaults to
    # only 3 entries; the two must have matching lengths — verify against the
    # dataset loader and pass explicit --sample_rates when using the defaults.
    parser.add_argument(
        "--dataset",
        default="refer_seg||correct_refer_seg||vqa||neg_refer_seg",
        type=str,
    )
    parser.add_argument("--sample_rates", default="9,3,3", type=str)
    parser.add_argument(
        "--sem_seg_data",
        default="ade20k||cocostuff||pascal_part||paco_lvis||mapillary",
        type=str,
    )
    parser.add_argument(
        "--refer_seg_data", default="refclef||refcoco||refcoco+||refcocog", type=str
    )
    parser.add_argument(
        "--neg_refer_seg_data", default="R-refcocog||R-refcoco||R-refcoco+", type=str
    )
    parser.add_argument(
        "--correct_refer_seg_data",
        default="fprefcocog||fprefcoco||fprefcoco+",
        type=str,
    )
    parser.add_argument("--vqa_data", default="llava_instruct_150k", type=str)
    parser.add_argument("--reason_seg_data", default="ReasonSeg|train", type=str)
    parser.add_argument("--val_dataset", default="ReasonSeg", type=str)
    parser.add_argument("--val_split", default="val", type=str)
    parser.add_argument("--dataset_dir", default="./dataset", type=str)
    parser.add_argument("--log_base_dir", default="./runs", type=str)
    parser.add_argument("--exp_name", default="read_referseg", type=str)
    parser.add_argument("--epochs", default=20, type=int)
    parser.add_argument("--steps_per_epoch", default=50, type=int)
    parser.add_argument(
        "--batch_size", default=12, type=int, help="batch size per device per step"
    )
    parser.add_argument(
        "--grad_accumulation_steps",
        default=1,
        type=int,
    )
    parser.add_argument("--val_batch_size", default=1, type=int)
    parser.add_argument("--workers", default=4, type=int)
    parser.add_argument("--lr", default=0.0003, type=float)
    parser.add_argument("--ce_loss_weight", default=1.0, type=float)
    parser.add_argument("--dice_loss_weight", default=0.5, type=float)
    parser.add_argument("--bce_loss_weight", default=2.0, type=float)
    parser.add_argument("--lora_alpha", default=16, type=int)
    parser.add_argument("--lora_dropout", default=0.05, type=float)
    parser.add_argument("--lora_target_modules", default="q_proj,v_proj", type=str)
    parser.add_argument("--beta1", default=0.9, type=float)
    parser.add_argument("--beta2", default=0.95, type=float)
    parser.add_argument("--num_classes_per_sample", default=1, type=int)
    parser.add_argument("--no_eval", action="store_true", default=False)
    parser.add_argument("--eval_only", action="store_true", default=False)
    parser.add_argument("--vision_pretrained", default="PATH_TO_SAM_ViT-H", type=str)
    parser.add_argument("--out_dim", default=256, type=int)
    parser.add_argument("--resume", default="", type=str)
    parser.add_argument("--print_freq", default=3, type=int)
    parser.add_argument("--start_epoch", default=0, type=int)
    # NOTE(review): store_true with default=True means the next three flags are
    # always True and cannot be disabled from the command line; kept unchanged
    # for backward compatibility with existing launch scripts.
    parser.add_argument("--gradient_checkpointing", action="store_true", default=True)
    parser.add_argument("--train_mask_decoder", action="store_true", default=True)
    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
    parser.add_argument("--auto_resume", action="store_true", default=False)
    parser.add_argument("--use_wandb", action="store_true", default=False)
    parser.add_argument("--use_released_param", action="store_true", default=False)
    parser.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    # Bug fix: the pasted function never parsed the arguments, so callers
    # received None instead of a Namespace.
    return parser.parse_args(args)
# Launch-script configuration values that override the argparse defaults above
# (presumably forwarded as --version, --vision_tower, --dataset_dir, etc. —
# TODO confirm against the actual training launch command).
VERSION="../dataset/llava-v1.5-7b"  # local path to LLaVA-v1.5-7B weights instead of the HF hub id
VISION_TOWER="../dataset/clip-vit-large-patch14-336"  # local path to the CLIP vision tower
DATASET_DIR="../dataset"  # root directory containing the training datasets
MODEL_MAX_LENGTH=2048  # raised from the 512 argparse default
EXP_NAME="READ-LLaVA-v1.5-7B"  # experiment name used for the run/log directory
BATCH_SIZE=2  # per-device batch size (argparse default is 12)
GRAD_ACCUMULATION_STEPS=10  # effective batch = BATCH_SIZE * GRAD_ACCUMULATION_STEPS * num_devices
NUM_CLASSES_PER_SAMPLE=2  # raised from the argparse default of 1