@@ -81,10 +81,11 @@ It’s so memory efficient that you can run it even in a free Google Colab.
 <summary>Python Code</summary>
 
 ```python
-# Make sure we are running the latest version of Transformers
-!pip install git+https://github.com/huggingface/transformers.git
+# Install transformers from `main` or from this stable branch:
+!pip install git+https://github.com/huggingface/transformers@v4.49.0-SmolVLM-2
 
 from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch
 
 model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
 processor = AutoProcessor.from_pretrained(model_path)
@@ -110,7 +111,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     return_dict=True,
     return_tensors="pt",
-).to(model.device)
+).to(model.device, dtype=torch.bfloat16)
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
 generated_texts = processor.batch_decode(
@@ -193,27 +194,25 @@ The easiest way to run inference with the SmolVLM2 models is through the convers
 You can load the model as follows.
 
 ```python
-
-# Make sure we are running the latest version of Transformers
-!pip install git+https://github.com/huggingface/transformers.git
+# Install transformers from `main` or from this stable branch:
+!pip install git+https://github.com/huggingface/transformers@v4.49.0-SmolVLM-2
 
 from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch
 
 processor = AutoProcessor.from_pretrained(model_path)
 model = AutoModelForImageTextToText.from_pretrained(
     model_path,
     torch_dtype=torch.bfloat16,
     _attn_implementation="flash_attention_2"
-).to(DEVICE)
+).to("cuda")
 ```
 
 #### Video Inference
 
 You can pass videos through a chat template by passing in `{"type": "video", "path": {video_path}}`. See below for a complete example.
 
 ```python
-import torch
-
 messages = [
     {
         "role": "user",
@@ -230,7 +229,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     return_dict=True,
     return_tensors="pt",
-).to(model.device)
+).to(model.device, dtype=torch.bfloat16)
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
 generated_texts = processor.batch_decode(
@@ -245,19 +244,16 @@ print(generated_texts[0])
 
 #### Multiple Image Inference
 
-In addition to video, SmolVLM2 supports multi-image conversations. You can use the same API through the chat template.
+In addition to video, SmolVLM2 supports multi-image conversations. You can use the same API through the chat template, providing each image as a filesystem path, a URL, or a `PIL.Image` object:
 
 ```python
-import torch
-
-
 messages = [
     {
         "role": "user",
         "content": [
             {"type": "text", "text": "What are the differences between these two images?"},
-            {"type": "image", "path": "image_1.png"},
-            {"type": "image", "path": "image_2.png"}
+            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"},
         ]
     },
 ]
@@ -268,7 +264,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     return_dict=True,
     return_tensors="pt",
-).to(model.device)
+).to(model.device, dtype=torch.bfloat16)
 
 generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
 generated_texts = processor.batch_decode(