Skip to content

Commit 5bf54e0

Browse files
mmoskal and JC1DA authored
add vision model support via python plugin (#17)
* start on python scripting * more python * somewhat working chat templates * experiments with tensor wrapping * typed output from input_processor * prompt params plumbing * rework dtype support * make TlcShape into value type * plumb args py -> C++ * pass bos/eos to python * enable debug mode in release profile * add additional args * fix script * Add simple qwen2-vl preprocess func test (#15) * Simple test for Qwen2-VL image processing * update vlm test * move files out of tests folder * model is bf16 native; disable double-load of tokenizer * disable warmup when py enabled * better logging * fix tensor location * prompt_tasks -> input_token_extra_ids * add standalone trt example * print out errors * mrope on CPU * allow image inputs * Add input_processor for phi-3.5-vision * better error for missing n_vocab_override; abort on failed CHECK() * bump llg * fix copy instructions in docs * use strum::FromRepr on C enums * fixes for batch size = 1 * add more kv cache control params * bump llg * pass max_new_tokens etc to plugin * Add llama-3.2-vision input_processor * cargo update * Clean up input_processor for qwen2-vl * remove unused imports * remove playground files * don't pip install on plugin construction * add link to gh issue --------- Co-authored-by: JC1DA <jc1da.3011@gmail.com>
1 parent 5573f01 commit 5bf54e0

32 files changed

+1684
-172
lines changed

Cargo.lock

Lines changed: 32 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ exclude = [
1616
resolver = "2"
1717

1818
[profile.release]
19-
# debug = 1
19+
debug = 1
2020

2121
[patch.crates-io]
2222
derivre = { path = "derivre" }

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,9 @@ trtllm-build --checkpoint_dir /models/model-ckpt \
139139
# Clean up checkpoint (optional)
140140
rm -rf /models/model-ckpt
141141

142-
# Finally, copy tokenizer.json and tokenizer_config.json
143-
cp /models/Meta-Llama-3.1-8B-Instruct/tokenizer.json /models/model-engine
144-
cp /models/Meta-Llama-3.1-8B-Instruct/tokenizer_config.json /models/model-engine
142+
# Finally, copy tokenizer and preprocessor files to engine folder
143+
cp /models/Meta-Llama-3.1-8B-Instruct/tokenizer*.json /models/model-engine
144+
cp /models/Meta-Llama-3.1-8B-Instruct/preprocessor*.json /models/model-engine # this may be missing
145145

146146
# Exit the container
147147
exit

docker/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ RUN pip uninstall -y guidance
6363

6464
RUN pip install --upgrade transformers
6565

66+
# TODO test this
67+
# RUN pip install flash-attn --no-build-isolation
68+
# RUN pip install qwen_vl_utils
69+
6670
RUN cd /usr/local/lib/python3.12/dist-packages/tensorrt_llm/libs/ && \
6771
ln -s libnvinfer_plugin_tensorrt_llm.so libnvinfer_plugin_tensorrt_llm.so.10
6872

llgtrt/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ version = "0.16.6"
44
edition = "2021"
55

66
[dependencies]
7-
axum = { version = "0.7" }
7+
axum = { version = "0.7", features = ["macros"] }
88
tokio = { version = "1.33.0", features = ["full"] }
99
async-stream = "0.3.5"
1010
anyhow = { version = "1.0.75", features = ["backtrace"] }
@@ -27,3 +27,5 @@ json5 = "0.4.1"
2727
minijinja-contrib = { version = "2.3.1", features = ["pycompat"] }
2828
safetensors = "0.5.2"
2929
memmap2 = "0.9.5"
30+
pyo3 = { version = "0.23.4", features = ["anyhow", "serde"] }
31+
num-traits = "0.2.19"
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# this currently doesn't work due to https://github.com/NVIDIA/TensorRT-LLM/issues/2796
2+
3+
import copy
4+
import requests
5+
import llgtrt_base
6+
import torch
7+
8+
from PIL import Image
9+
from transformers import MllamaForConditionalGeneration, AutoProcessor
10+
from llgtrt_native import PluginInit
11+
12+
class Plugin(llgtrt_base.PluginBase):
    """Input-processor plugin for Llama-3.2-vision (Mllama).

    The HF model is loaded on CPU; its vision tower and multi-modal projector
    are used to pre-compute cross-attention features, which are returned to
    the llgtrt runtime together with the prompt tokens.
    """

    def __init__(self, init: PluginInit):
        """Load the Mllama model and its processor from ``init.hf_model_dir``."""
        super().__init__(init)
        self.model = MllamaForConditionalGeneration.from_pretrained(
            init.hf_model_dir,
            device_map="cpu",
            trust_remote_code=True,
        )
        self.processor = AutoProcessor.from_pretrained(
            init.hf_model_dir,
            trust_remote_code=True,
        )
        # NOTE(review): the original comment said "move visual model to gpu",
        # but both modules are explicitly kept on CPU -- presumably a
        # workaround related to the TensorRT-LLM issue referenced at the top
        # of this file; confirm before changing.
        self.model.vision_model = self.model.vision_model.to("cpu")
        self.model.multi_modal_projector = self.model.multi_modal_projector.to("cpu")
        print("Plugin initialized from HF model directory:", init.hf_model_dir)

    def process_input(
        self, params: llgtrt_base.ProcessInputParams
    ) -> llgtrt_base.ProcessInputResult:
        """Turn chat messages (optionally containing image URLs) into engine input.

        Text-only requests are tokenized with the chat template.  Requests
        with images additionally run the vision tower + projector and attach
        the resulting cross-attention tensors to the result.
        """
        messages = params.messages
        print("process_input called, ", messages)

        # Rewrite OpenAI-style "image_url" content parts to HF "image" parts
        # and collect the URLs so the images can be fetched below.
        messages, urls = self._process_messages(messages)

        images = [Image.open(requests.get(url, stream=True).raw) for url in urls]

        prompt = self.processor.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Fast path: no images -> plain tokenized chat prompt, no vision work.
        if not images:
            return llgtrt_base.ProcessInputResult(
                prompt=prompt,
                tokens=self.processor.tokenizer.apply_chat_template(
                    messages,
                    tokenize=True,
                    add_generation_prompt=True,
                ),
            )

        inputs = self.processor(
            images,
            prompt,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.model.vision_model.device)

        vision_outputs = self.model.vision_model(
            pixel_values=inputs["pixel_values"],
            aspect_ratio_ids=inputs["aspect_ratio_ids"],
            aspect_ratio_mask=inputs["aspect_ratio_mask"],
            output_hidden_states=False,
            output_attentions=False,
            return_dict=True,
        )
        cross_attention_states = vision_outputs[0]
        # Project vision features into the text hidden size and flatten to
        # (num_vision_tokens_total, hidden_size).
        cross_attention_states = self.model.multi_modal_projector(
            cross_attention_states
        ).reshape(-1, self.model.hidden_size)

        cross_attention_mask = _prepare_cross_attention_mask(
            inputs["cross_attention_mask"][0],
            num_vision_tokens=self.model.vision_model.num_patches,
            dtype=self.model.dtype,
            max_new_tokens=params.max_new_tokens,
        )
        cross_attention_mask = cross_attention_mask.reshape(
            -1, cross_attention_states.shape[0]
        )

        r = llgtrt_base.ProcessInputResult(
            prompt=prompt,
            tokens=inputs["input_ids"].cpu().numpy()[0].tolist(),
        )
        # Change this to bfloat16 if engine is using bfloat16.
        r.encoder_input_features = cross_attention_states.cuda().half()
        r.cross_attention_masks = cross_attention_mask.cuda()
        r.skip_cross_attn_blocks = torch.Tensor([False]).cuda()
        r.encoder_output_length = cross_attention_states.shape[0]

        return r

    def _process_messages(self, messages: list[dict]):
        """Return ``(messages, urls)``: a deep copy of *messages* with every
        ``image_url`` content part rewritten to an ``image`` part, plus the
        list of extracted URLs (in encounter order)."""
        urls = []
        messages = copy.deepcopy(messages)
        for m in messages:
            c = m.get("content", None)
            if isinstance(c, list):
                parts_to_change = []
                for part in c:
                    if part["type"] == "image_url":
                        url = part["image_url"]["url"]
                        urls.append(url)
                        parts_to_change.append(part)

                # Mutate after the scan so we don't alter the list mid-iteration.
                for part in parts_to_change:
                    part["type"] = "image"
                    part.pop("image_url", None)

        return messages, urls
113+
114+
115+
def _prepare_cross_attention_mask(
116+
cross_attention_mask: torch.Tensor,
117+
num_vision_tokens: int,
118+
dtype: str,
119+
max_new_tokens=100,
120+
) -> torch.Tensor:
121+
text_total_length, *_ = cross_attention_mask.shape
122+
cross_attention_mask = cross_attention_mask.repeat_interleave(
123+
num_vision_tokens, dim=2)
124+
125+
cross_attention_mask = cross_attention_mask.view(
126+
text_total_length, -1)
127+
cross_attention_mask = cross_attention_mask.unsqueeze(1)
128+
cross_attention_mask = cross_attention_mask.to(
129+
dtype).to(torch.bool).reshape(
130+
[-1, cross_attention_mask.shape[-1]])
131+
132+
# prepare cross_attention_mask for generation phase and concat them
133+
tmp_mask = [cross_attention_mask] + [
134+
cross_attention_mask[-1:, :] for _ in range(max_new_tokens)
135+
]
136+
cross_attention_mask = torch.concat(tmp_mask)
137+
138+
return cross_attention_mask

0 commit comments

Comments (0)