@@ -1,10 +1,11 @@
+import argparse
 import os
 import sys
-import argparse
-from PIL import Image as PIL_Image
+
 import torch
+from accelerate import Accelerator
+from PIL import Image as PIL_Image
 from transformers import MllamaForConditionalGeneration, MllamaProcessor
-from accelerate import Accelerator

 accelerator = Accelerator()

@@ -14,15 +15,19 @@
 DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"


-def load_model_and_processor(model_name: str, hf_token: str):
+def load_model_and_processor(model_name: str):
     """
     Load the model and processor based on the 11B or 90B model.
     """
-    model = MllamaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16, use_safetensors=True, device_map=device,
-                                                           token=hf_token)
-    processor = MllamaProcessor.from_pretrained(model_name, token=hf_token, use_safetensors=True)
+    model = MllamaForConditionalGeneration.from_pretrained(
+        model_name,
+        torch_dtype=torch.bfloat16,
+        use_safetensors=True,
+        device_map=device,
+    )
+    processor = MllamaProcessor.from_pretrained(model_name, use_safetensors=True)

-    model, processor = accelerator.prepare(model, processor)
+    model, processor = accelerator.prepare(model, processor)
     return model, processor

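Note that neither hunk shown so far defines `device`, which both `from_pretrained` and the processor call reference; its definition sits in the lines the diff viewer collapsed. A minimal sketch of the usual Accelerate idiom, assuming that is what the hidden code does:

    from accelerate import Accelerator

    accelerator = Accelerator()
    # Assumed definition -- the actual line falls outside the displayed hunks.
    device = accelerator.device  # e.g. "cuda:0" when a GPU is visible
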
@@ -37,37 +42,67 @@ def process_image(image_path: str) -> PIL_Image.Image:
         return PIL_Image.open(f).convert("RGB")


-def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float):
+def generate_text_from_image(
+    model, processor, image, prompt_text: str, temperature: float, top_p: float
+):
     """
     Generate text from an image using the model and processor.
     """
     conversation = [
-        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
+        {
+            "role": "user",
+            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
+        }
     ]
-    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    prompt = processor.apply_chat_template(
+        conversation, add_generation_prompt=True, tokenize=False
+    )
     inputs = processor(image, prompt, return_tensors="pt").to(device)
-    output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=512)
-    return processor.decode(output[0])[len(prompt):]
+    output = model.generate(
+        **inputs, temperature=temperature, top_p=top_p, max_new_tokens=512
+    )
+    return processor.decode(output[0])[len(prompt) :]


-def main(image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str, hf_token: str):
+def main(
+    image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str
+):
     """
-    Call all the functions.
+    Call all the functions.
     """
-    model, processor = load_model_and_processor(model_name, hf_token)
+    model, processor = load_model_and_processor(model_name)
     image = process_image(image_path)
-    result = generate_text_from_image(model, processor, image, prompt_text, temperature, top_p)
+    result = generate_text_from_image(
+        model, processor, image, prompt_text, temperature, top_p
+    )
     print("Generated Text: " + result)


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Generate text from an image and prompt using the 3.2 MM Llama model.")
+    parser = argparse.ArgumentParser(
+        description="Generate text from an image and prompt using the 3.2 MM Llama model."
+    )
     parser.add_argument("--image_path", type=str, help="Path to the image file")
-    parser.add_argument("--prompt_text", type=str, help="Prompt text to describe the image")
-    parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for generation (default: 0.7)")
-    parser.add_argument("--top_p", type=float, default=0.9, help="Top p for generation (default: 0.9)")
-    parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help=f"Model name (default: '{DEFAULT_MODEL}')")
-    parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face token for authentication")
+    parser.add_argument(
+        "--prompt_text", type=str, help="Prompt text to describe the image"
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.7,
+        help="Temperature for generation (default: 0.7)",
+    )
+    parser.add_argument(
+        "--top_p", type=float, default=0.9, help="Top p for generation (default: 0.9)"
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default=DEFAULT_MODEL,
+        help=f"Model name (default: '{DEFAULT_MODEL}')",
+    )

     args = parser.parse_args()
-    main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token)
+    main(
+        args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name
+    )
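
The diff drops the `--hf_token` flag entirely, so `from_pretrained` now falls back to the standard Hugging Face credential chain: the `HF_TOKEN` environment variable, or the token cached by `huggingface-cli login`. A minimal pre-flight sketch under that assumption (the gated-repo requirement is our reading, not stated in the diff):

    import os

    from huggingface_hub import login

    # The Llama 3.2 vision checkpoints are gated, so some credential must still
    # be available; prompt for one only when no HF_TOKEN is exported.
    if os.environ.get("HF_TOKEN") is None:
        login()  # interactive prompt; caches the token for later runs

After authenticating once, the script is invoked with only the generation flags: `--image_path`, `--prompt_text`, and optionally `--temperature`, `--top_p`, and `--model_name`.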