
Commit 47b0fca

SNHIPOW and anxiangsir authored

Downstream (#83)

* add downstream branch
* update performance
* fix some bugs

Co-authored-by: Xiang An <anxiangsir@outlook.com>

1 parent b32f4cb · commit 47b0fca

File tree

4 files changed: +17 −5 lines changed

downstream/README.md

Lines changed: 6 additions & 0 deletions

```diff
@@ -9,6 +9,12 @@
 [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco?p=multi-label-cluster-discrimination-for-visual)
 
 
+# MLCD-Seg
+[![Hugging Face](https://img.shields.io/badge/Hugging%20Face-MLCD_SEG_Model-yellow)](https://huggingface.co/DeepGlint-AI/MLCD-Seg-7B)
+
+This repository researches the application of multimodal large models to downstream tasks through an end-to-end approach. At present, the segmentation part has achieved excellent results on the referring expression segmentation task.
+
+
 ## RefCOCO Segmentation Evaluation:
 
 | Dataset | Split | MLCD-seg-7B | EVF-SAM | GLaMM | VisionLLM v2| LISA |
```

downstream/eval/eval/model_vqa_refcoco.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -9,7 +9,9 @@
 import numpy as np
 
 import sys
-sys.path.insert(0,'./downstream/llava')
+
+sys.path.insert(0,'.')
+
 from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_SEG_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from llava.conversation import conv_templates, SeparatorStyle
 from llava.model.builder import load_pretrained_model
```
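The effect of the path change above can be sketched in isolation: prepending an entry to `sys.path` controls where the subsequent `from llava...` imports resolve from. A minimal sketch (no `llava` package is assumed to actually be present):

```python
import sys

# Mirror of the commit's change: prepend the current working directory so a
# "llava" package located there wins import resolution. Before the commit, a
# hard-coded "./downstream/llava" path was prepended instead, which only
# resolved correctly when the script was launched from the repository root.
sys.path.insert(0, '.')

# The prepended entry is consulted first by the import machinery.
print(sys.path[0])
```

The practical consequence is that the eval script is now expected to be launched from the directory that directly contains the `llava` package.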
Lines changed: 5 additions & 3 deletions
```diff
@@ -1,10 +1,12 @@
 json_path=./eval
 gpu_num=8
-checkpoints_name=./checkpoints/
+
 result_name=./eval/results
+train_image_path=/vlm/kunwu/data/llava_train_img/glamm_data
 
-model_name=llava-seg-DeepGlint-AI_mlcd-vit-large-patch14-336-Qwen_Qwen2.5-7B-Instruct-1.8m
+model_name=DeepGlint-AI/MLCD-Seg-7B
 echo $model_name
 
-./eval/script/eval_multiprocess.sh $checkpoints_name/$model_name $json_path/refcoco.json $result_name/$model_name/refcoco /vlm/kunwu/data/llava_train_img/glamm_data "" $gpu_num 0.2
+./eval/script/eval_multiprocess.sh $model_name $json_path/refcoco.json $result_name/$model_name/refcoco $train_image_path "" $gpu_num 0.2
 python ./eval/eval/evaluate_refcoco.py --result-dir $result_name/$model_name/refcoco
+
```
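One side effect of switching `model_name` from a local checkpoint name to a Hugging Face repo id is worth noting: the id contains a slash, so `$result_name/$model_name/refcoco` gains an extra directory level. A small sketch of the path composition (`pathlib` used purely for illustration):

```python
from pathlib import Path

# Sketch of how the result directory is composed in the updated script.
# model_name now holds a Hugging Face repo id ("org/name"), so the slash
# inside it adds one extra directory level under the results root.
result_name = "./eval/results"
model_name = "DeepGlint-AI/MLCD-Seg-7B"

result_dir = Path(result_name) / model_name / "refcoco"
print(result_dir.as_posix())  # eval/results/DeepGlint-AI/MLCD-Seg-7B/refcoco
```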

downstream/llava/model/builder.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change

```diff
@@ -54,7 +54,9 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
         model_name = f"llava_qwen_{model_name}"
     if "llava-seg-DeepGlint" in model_name:
         model_name = f"llava_qwen_{model_name}"
-
+    if "MLCD" in model_name:
+        model_name = f"llava_qwen_{model_name}"
+
     if "llava" in model_name.lower() or is_multimodal:
         # Load LLaVA model
         if "lora" in model_name.lower() and model_base is None:
```
