@@ -5,17 +5,20 @@
 import torch
 from transformers import MllamaForConditionalGeneration, MllamaProcessor
 
+
 # Constants
 DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 
-def load_model_and_processor(model_name: str):
+
+def load_model_and_processor(model_name: str, hf_token: str):
     """
     Load the model and processor based on the 11B or 90B model.
     """
-    model = MllamaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
-    processor = MllamaProcessor.from_pretrained(model_name)
+    model = MllamaForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, token=hf_token)
+    processor = MllamaProcessor.from_pretrained(model_name, token=hf_token)
     return model, processor
 
+
 def process_image(image_path: str) -> PIL_Image.Image:
     """
     Open and convert an image from the specified path.
@@ -26,6 +29,7 @@ def process_image(image_path: str) -> PIL_Image.Image:
     with open(image_path, "rb") as f:
         return PIL_Image.open(f).convert("RGB")
 
+
 def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float):
     """
     Generate text from an image using the model and processor.
@@ -38,22 +42,25 @@ def generate_text_from_image(model, processor, image, prompt_text: str, temperature: float, top_p: float):
     output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=512)
     return processor.decode(output[0])[len(prompt):]
 
-def main(image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str):
+
+def main(image_path: str, prompt_text: str, temperature: float, top_p: float, model_name: str, hf_token: str):
     """
     Call all the functions.
     """
-    model, processor = load_model_and_processor(model_name)
+    model, processor = load_model_and_processor(model_name, hf_token)
     image = process_image(image_path)
     result = generate_text_from_image(model, processor, image, prompt_text, temperature, top_p)
     print("Generated Text: " + result)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate text from an image and prompt using the 3.2 MM Llama model.")
-    parser.add_argument("image_path", type=str, help="Path to the image file")
-    parser.add_argument("prompt_text", type=str, help="Prompt text to describe the image")
+    parser.add_argument("--image_path", type=str, help="Path to the image file")
+    parser.add_argument("--prompt_text", type=str, help="Prompt text to describe the image")
     parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for generation (default: 0.7)")
     parser.add_argument("--top_p", type=float, default=0.9, help="Top p for generation (default: 0.9)")
     parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help=f"Model name (default: '{DEFAULT_MODEL}')")
+    parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face token for authentication")
 
     args = parser.parse_args()
-    main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name)
+    main(args.image_path, args.prompt_text, args.temperature, args.top_p, args.model_name, args.hf_token)
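To see how the new hf_token parameter threads through the script, here is a minimal sketch of calling the updated functions directly from Python. The module name multi_modal_infer is an assumption (the diff does not show the filename), and the token string and image path are placeholders:

    # A sketch, not part of the diff: exercises the new hf_token parameter.
    # The module name is assumed; substitute the actual script/module path.
    from multi_modal_infer import (
        generate_text_from_image,
        load_model_and_processor,
        process_image,
    )

    model, processor = load_model_and_processor(
        "meta-llama/Llama-3.2-11B-Vision-Instruct",
        hf_token="hf_...",  # placeholder Hugging Face access token
    )
    image = process_image("sample.jpg")  # placeholder image path
    print(generate_text_from_image(model, processor, image, "Describe this image.", 0.7, 0.9))

From the shell, the equivalent run would be python multi_modal_infer.py --image_path sample.jpg --prompt_text "Describe this image." --hf_token hf_... (same placeholder caveats). Note that the script makes --hf_token required, though transformers itself can also pick up credentials from huggingface-cli login or the HF_TOKEN environment variable.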