server.py
from flask import Flask, request, jsonify
import os
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig, Gemma3ForConditionalGeneration
import torch
from PIL import Image
from werkzeug.utils import secure_filename

# Maximum number of new tokens to generate per request
max_token = 1024

# Select the model to serve; uncomment one of the alternatives to switch.
# model_path = "microsoft/Phi-4-multimodal-instruct"
# model_path = 'AIDC-AI/Ovis2-16B'
# model_path = 'AIDC-AI/Ovis2-34B'
model_path = 'google/gemma-3-12b-it'
# Load the custom model
class CustomModel:
    """Wraps one of several multimodal chat models behind a common respond() API."""

    def __init__(self, model_path, language_only):
        self.model_path = model_path
        self.language_only = language_only  # currently unused; kept for API compatibility
        self.model_type = 'custom'
        if 'Ovis' in model_path:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                multimodal_max_length=20000,
                trust_remote_code=True,
                attn_implementation="eager",
                device_map='auto')
            self.text_tokenizer = self.model.get_text_tokenizer()
            self.visual_tokenizer = self.model.get_visual_tokenizer()
        elif 'Phi-4' in model_path:
            self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
                device_map='auto',
                attn_implementation="flash_attention_2"
            )
            self.generation_config = GenerationConfig.from_pretrained(model_path)
        elif 'gemma' in model_path:
            self.model = Gemma3ForConditionalGeneration.from_pretrained(
                model_path, device_map="auto", torch_dtype=torch.bfloat16,
                attn_implementation="eager"
            )
            self.processor = AutoProcessor.from_pretrained(model_path)
    def respond(self, prompt, image_path=None):
        if 'microsoft/Phi-4' in self.model_path:
            # Phi-4 uses a plain-text chat template with explicit role tags.
            user_prompt = '<|user|>'
            assistant_prompt = '<|assistant|>'
            prompt_suffix = '<|end|>'
            formatted_prompt = f'{user_prompt}<|image_1|>{prompt}{prompt_suffix}{assistant_prompt}'
            image = Image.open(image_path)
            inputs = self.processor(text=formatted_prompt, images=image, return_tensors='pt').to(self.model.device)
            with torch.no_grad():
                generate_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=max_token,  # adjust as needed
                    generation_config=self.generation_config,
                )
            # Strip the prompt tokens so only the newly generated answer is decoded.
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
            response = self.processor.batch_decode(
                generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]
        elif 'Ovis' in self.model_path:
            images = [Image.open(image_path)]
            max_partition = 9
            query = f'<image>\n{prompt}'
            prompt, input_ids, pixel_values = self.model.preprocess_inputs(query, images, max_partition=max_partition)
            attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
            input_ids = input_ids.unsqueeze(0).to(device=self.model.device)
            attention_mask = attention_mask.unsqueeze(0).to(device=self.model.device)
            if pixel_values is not None:
                pixel_values = pixel_values.to(dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device)
            pixel_values = [pixel_values]
            # Generate output greedily (do_sample=False)
            with torch.inference_mode():
                gen_kwargs = dict(
                    max_new_tokens=max_token,
                    do_sample=False,
                    repetition_penalty=None,
                    eos_token_id=self.model.generation_config.eos_token_id,
                    pad_token_id=self.text_tokenizer.pad_token_id,
                    use_cache=True
                )
                output_ids = self.model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
                response = self.text_tokenizer.decode(output_ids, skip_special_tokens=True)
        else:
            # Gemma 3 path: build a chat-format message list and let the
            # processor apply the model's chat template.
            messages = [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": "You are a helpful assistant."}]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": image_path},
                        {"type": "text", "text": prompt}
                    ]
                }
            ]
            inputs = self.processor.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True,
                return_dict=True, return_tensors="pt"
            ).to(self.model.device)
            input_len = inputs["input_ids"].shape[-1]
            with torch.inference_mode():
                generation = self.model.generate(**inputs, max_new_tokens=max_token, do_sample=False, use_cache=True)
            # Decode only the tokens generated after the prompt.
            generation = generation[0][input_len:]
            response = self.processor.decode(generation, skip_special_tokens=True)
        return response
# Initialize Flask app and model
app = Flask(__name__)
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
model = CustomModel(model_path=model_path, language_only=False)

@app.route('/process', methods=['POST'])
def process_request():
    if 'image' not in request.files or 'sentence' not in request.form:
        return jsonify({'error': 'Missing image or sentence'}), 400
    image = request.files['image']
    sentence = request.form['sentence']
    if image.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Save the image temporarily; sanitize the client-supplied filename so it
    # cannot escape UPLOAD_FOLDER via path traversal.
    image_path = os.path.join(UPLOAD_FOLDER, secure_filename(image.filename))
    image.save(image_path)
    # Generate response from the model
    model_response = model.respond(sentence, image_path=image_path)
    return jsonify({'response': model_response})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=23333)
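
# Example client call: a minimal sketch, assuming the server is running locally
# on port 23333 as configured above, and that 'test.jpg' is a hypothetical
# local image file. The multipart field names 'image' and 'sentence' match
# what process_request() expects.
#
#   import requests
#
#   with open('test.jpg', 'rb') as f:
#       resp = requests.post(
#           'http://localhost:23333/process',
#           files={'image': f},
#           data={'sentence': 'Describe this image.'},
#       )
#   print(resp.json()['response'])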