Skip to content

Commit e3230ac

Browse files
authored
fix generation config (#77)
1 parent ad1a24d commit e3230ac

23 files changed

+177
-75
lines changed

examples/EvoVLM_JP_v1_7B.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from transformers import AutoModelForVision2Seq, AutoProcessor
33
import torch
44
from base_vlm import BaseVLM
5+
from utils import GenerationConfig
56

67

78
class VLM(BaseVLM):
@@ -15,8 +16,15 @@ def __init__(self) -> None:
1516
self.processor = AutoProcessor.from_pretrained(self.model_id)
1617
self.model.to(self.device)
1718

18-
def generate(self, image, text: str, max_new_tokens: int = 256):
19-
text = f"<image>{text}"
19+
def generate(
20+
self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
21+
):
22+
text = text.replace("<image>", "")
23+
if isinstance(image, list):
24+
text = "<image>" * len(image) + f"{text}"
25+
else:
26+
text = f"<image>{text}"
27+
2028
messages = [
2129
{
2230
"role": "system",
@@ -29,7 +37,7 @@ def generate(self, image, text: str, max_new_tokens: int = 256):
2937
messages, return_tensors="pt"
3038
)
3139
output_ids = self.model.generate(
32-
**inputs.to(self.device), max_new_token=max_new_tokens
40+
**inputs.to(self.device), **gen_kwargs.__dict__
3341
)
3442
output_ids = output_ids[:, inputs.input_ids.shape[1] :]
3543
generated_text = self.processor.batch_decode(

examples/GPT_4o.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from io import BytesIO
44
import base64
55
from base_vlm import BaseVLM
6+
from utils import GenerationConfig
67

78

89
def encode_image_to_base64(image):
@@ -22,7 +23,9 @@ def __init__(self) -> None:
2223
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
2324
)
2425

25-
def generate(self, image, text: str, max_new_tokens: int = 256):
26+
def generate(
27+
self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
28+
):
2629
if "<image>" in text:
2730
text = text.replace("<image>", "")
2831
message = []
@@ -70,7 +73,11 @@ def generate(self, image, text: str, max_new_tokens: int = 256):
7073
]
7174
try:
7275
response = self.client.chat.completions.create(
73-
model=self.model_id, messages=message, max_tokens=max_new_tokens
76+
model=self.model_id,
77+
messages=message,
78+
max_tokens=gen_kwargs.max_new_tokens,
79+
temperature=gen_kwargs.temperature,
80+
top_p=gen_kwargs.top_p,
7481
)
7582
return response.choices[0].message.content
7683
except Exception as e:

examples/InternVL2_8B.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from transformers import AutoModel, AutoTokenizer
66
from typing import Union
77
from base_vlm import BaseVLM
8+
from utils import GenerationConfig
89

910
IMAGENET_MEAN = (0.485, 0.456, 0.406)
1011
IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -135,7 +136,9 @@ def __init__(self) -> None:
135136
self.model_id, trust_remote_code=True, use_fast=False
136137
)
137138

138-
def generate(self, image, text: str, max_new_tokens: int = 256):
139+
def generate(
140+
self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
141+
):
139142
text = text.replace("<image>", "")
140143
if "<image>" not in text:
141144
if isinstance(image, list):
@@ -164,14 +167,12 @@ def generate(self, image, text: str, max_new_tokens: int = 256):
164167
load_image(image, max_num=12).to(self.model.device).to(self.model.dtype)
165168
)
166169

167-
generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
168-
169170
response = self.model.chat(
170171
self.tokenizer,
171172
pixel_values,
172173
text,
173-
generation_config,
174174
num_patches_list=num_patches_list,
175+
generation_config=gen_kwargs.__dict__,
175176
)
176177
generated_text = response
177178
return generated_text

examples/Llama_3_2_11B_Vision_Instruct.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from transformers import MllamaForConditionalGeneration, AutoProcessor
44
from typing import Union
55
from base_vlm import BaseVLM
6+
from utils import GenerationConfig
67

78

89
class VLM(BaseVLM):
@@ -20,7 +21,7 @@ def generate(
2021
self,
2122
images: Union[Image.Image, list[Image.Image]],
2223
text: str,
23-
max_new_tokens: int = 256,
24+
gen_kwargs: GenerationConfig = GenerationConfig(),
2425
):
2526
if "<image>" in text:
2627
text = text.replace("<image>", "")
@@ -41,7 +42,7 @@ def generate(
4142
inputs = self.processor(
4243
images, input_text, add_special_tokens=False, return_tensors="pt"
4344
).to(self.model.device)
44-
output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
45+
output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
4546
generated_ids = [
4647
output_ids[len(input_ids) :]
4748
for input_ids, output_ids in zip(inputs.input_ids, output_ids)

examples/Llama_3_2_11B_Vision_Instruct_Swallow_8B_Merge.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from transformers import MllamaForConditionalGeneration, AutoProcessor
44
from typing import Union
55
from base_vlm import BaseVLM
6+
from utils import GenerationConfig
67

78

89
class VLM(BaseVLM):
@@ -20,7 +21,7 @@ def generate(
2021
self,
2122
images: Union[Image.Image, list[Image.Image]],
2223
text: str,
23-
max_new_tokens: int = 256,
24+
gen_kwargs: GenerationConfig = GenerationConfig(),
2425
):
2526
if "<image>" in text:
2627
text = text.replace("<image>", "")
@@ -41,7 +42,7 @@ def generate(
4142
inputs = self.processor(
4243
images, input_text, add_special_tokens=False, return_tensors="pt"
4344
).to(self.model.device)
44-
output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
45+
output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
4546
generated_ids = [
4647
output_ids[len(input_ids) :]
4748
for input_ids, output_ids in zip(inputs.input_ids, output_ids)

examples/Llama_3_EZO_VLM_1.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
)
88
from mantis.models.mllava.utils import conv_templates
99
from base_vlm import BaseVLM
10+
from utils import GenerationConfig
1011

1112
# 1. Set the system prompt
1213
conv_llama_3_elyza = Conversation(
@@ -33,13 +34,9 @@ def __init__(self) -> None:
3334
)
3435
self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token
3536

36-
def generate(self, image, text: str, max_new_tokens: int = 256):
37-
generation_kwargs = {
38-
"max_new_tokens": max_new_tokens,
39-
"num_beams": 1,
40-
"do_sample": False,
41-
"no_repeat_ngram_size": 3,
42-
}
37+
def generate(
38+
self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
39+
):
4340
if isinstance(image, list):
4441
if "<image>" not in text:
4542
text = "<image> " * len(image) + "\n" + text
@@ -49,7 +46,7 @@ def generate(self, image, text: str, max_new_tokens: int = 256):
4946
text = "<image>\n" + text
5047
images = [image]
5148
response, history = chat_mllava(
52-
text, images, self.model, self.processor, **generation_kwargs
49+
text, images, self.model, self.processor, **gen_kwargs.__dict__
5350
)
5451
return response
5552

examples/Llama_3_EvoVLM_JP_v2.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
)
88
from mantis.models.mllava.utils import conv_templates
99
from base_vlm import BaseVLM
10+
from utils import GenerationConfig
1011

1112
# 1. Set the system prompt
1213
conv_llama_3_elyza = Conversation(
@@ -33,13 +34,9 @@ def __init__(self) -> None:
3334
)
3435
self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token
3536

36-
def generate(self, image, text: str, max_new_tokens: int = 256):
37-
generation_kwargs = {
38-
"max_new_tokens": max_new_tokens,
39-
"num_beams": 1,
40-
"do_sample": False,
41-
"no_repeat_ngram_size": 3,
42-
}
37+
def generate(
38+
self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
39+
):
4340
if isinstance(image, list):
4441
if "<image>" not in text:
4542
text = "<image> " * len(image) + "\n" + text
@@ -49,7 +46,7 @@ def generate(self, image, text: str, max_new_tokens: int = 256):
4946
text = "<image>\n" + text
5047
images = [image]
5148
response, history = chat_mllava(
52-
text, images, self.model, self.processor, **generation_kwargs
49+
text, images, self.model, self.processor, **gen_kwargs.__dict__
5350
)
5451
return response
5552

examples/Pangea_7B_hf.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from PIL import Image
55
from typing import Union
66
from base_vlm import BaseVLM
7+
from utils import GenerationConfig
78

89

910
class VLM(BaseVLM):
@@ -21,7 +22,7 @@ def generate(
2122
self,
2223
images: Union[Image.Image, list[Image.Image]],
2324
text: str,
24-
max_new_tokens: int = 256,
25+
gen_kwargs: GenerationConfig = GenerationConfig(),
2526
):
2627
if isinstance(images, list):
2728
prompt_template = (
@@ -39,11 +40,7 @@ def generate(
3940
).to("cuda", torch.float16)
4041
output = self.model.generate(
4142
**model_inputs,
42-
max_new_tokens=max_new_tokens,
43-
min_new_tokens=32,
44-
temperature=1.0,
45-
top_p=0.9,
46-
do_sample=True,
43+
**gen_kwargs.__dict__,
4744
)
4845
output = output[0]
4946
result = self.processor.decode(

examples/Pixtral_12B_2409.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import base64
66
from io import BytesIO
77
from base_vlm import BaseVLM
8+
from utils import GenerationConfig
89

910

1011
def image_to_base64(img):
@@ -42,7 +43,7 @@ def generate(
4243
self,
4344
images: Union[Image.Image, list[Image.Image]],
4445
text: str,
45-
max_new_tokens: int = 256,
46+
gen_kwargs: GenerationConfig = GenerationConfig(),
4647
):
4748
if isinstance(images, list):
4849
content = [image_to_content(image) for image in images]
@@ -57,7 +58,11 @@ def generate(
5758
}
5859
]
5960

60-
sampling_params = SamplingParams(max_tokens=max_new_tokens)
61+
sampling_params = SamplingParams(
62+
max_tokens=gen_kwargs.max_new_tokens,
63+
temperature=gen_kwargs.temperature,
64+
top_p=gen_kwargs.top_p,
65+
)
6166
outputs = self.model.chat(
6267
messages,
6368
sampling_params=sampling_params,

examples/Qwen2_VL_7B_Instruct.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import base64
44
from qwen_vl_utils import process_vision_info
55
from base_vlm import BaseVLM
6+
from utils import GenerationConfig
67

78

89
class VLM(BaseVLM):
@@ -18,7 +19,9 @@ def __init__(self) -> None:
1819
self.model_id, min_pixels=min_pixels, max_pixels=max_pixels
1920
)
2021

21-
def generate(self, image, text: str, max_new_tokens: int = 256):
22+
def generate(
23+
self, image, text: str, gen_kwargs: GenerationConfig = GenerationConfig()
24+
):
2225
if "<image>" in text:
2326
text = text.replace("<image>", "")
2427
message = []
@@ -75,7 +78,7 @@ def generate(self, image, text: str, max_new_tokens: int = 256):
7578
)
7679

7780
inputs = inputs.to(self.model.device)
78-
output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
81+
output_ids = self.model.generate(**inputs, **gen_kwargs.__dict__)
7982
generated_ids = [
8083
output_ids[len(input_ids) :]
8184
for input_ids, output_ids in zip(inputs.input_ids, output_ids)

0 commit comments

Comments (0)