@@ -128,6 +128,14 @@ def forward(
         decoder_input = self._get_decoder_input(
             tokens, encoder_output=encoder_output, post_tokens=post_tokens
         )
+
+        if input_pos is None:
+            input_pos = torch.arange(
+                decoder_input.shape[1],
+                device=decoder_input.device,
+                dtype=torch.int,
+            )
+
         return self.decoder(decoder_input, input_pos=input_pos)

     def setup_caches(self, batch_size, max_seq_len):
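For reference, the added default makes an omitted input_pos equivalent to passing sequential positions for the whole prompt. A minimal standalone sketch of that behavior (the tensor shape here is illustrative, not taken from the model):

    import torch

    # Hypothetical decoder input of shape [batch, seq_len, dim]; only seq_len matters here.
    decoder_input = torch.zeros(1, 35, 4096)

    # Equivalent of the new default: positions 0..seq_len-1 as int32.
    input_pos = torch.arange(decoder_input.shape[1], dtype=torch.int)
    assert input_pos.tolist() == list(range(35))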
@@ -977,98 +985,3 @@ def setup_caches(self, max_batch_size, max_seq_length):

 except:
     pass
-
-
-if __name__ == "__main__":
-    def prepare_image(target_h: int, target_w: int) -> torch.Tensor:
-        """Read image into a tensor and resize the image so that it fits in
-        a target_h x target_w canvas.
-
-        Args:
-            image (Image): An Image object.
-            target_h (int): Target height.
-            target_w (int): Target width.
-
-        Returns:
-            torch.Tensor: resized image tensor.
-        """
-        image = Image.open(
-            requests.get(
-                "https://llava-vl.github.io/static/images/view.jpg", stream=True
-            ).raw)
-
-        img = torchvision.transforms.functional.pil_to_tensor(image)
-        # height ratio
-        ratio_h = img.shape[1] / target_h
-        # width ratio
-        ratio_w = img.shape[2] / target_w
-        # resize the image so that it fits in a target_h x target_w canvas
-        ratio = max(ratio_h, ratio_w)
-        output_size = (int(img.shape[1] / ratio), int(img.shape[2] / ratio))
-        img = torchvision.transforms.Resize(size=output_size)(img)
-        return img
-
-
-    def image_preprocess(img: torch.Tensor, target_h: int, target_w: int, rescale_factor, image_mean, image_std) -> torch.Tensor:
-        # pad the image with median rgb value, to make a square
-        l_pad = (target_w - img.shape[2]) // 2
-        t_pad = (target_h - img.shape[1]) // 2
-        # ceil division
-        r_pad = -((target_w - img.shape[2]) // -2)
-        b_pad = -((target_h - img.shape[1]) // -2)
-
-        torch._check(l_pad >= 0)
-        torch._check(t_pad >= 0)
-        torch._check(r_pad >= 0)
-        torch._check(b_pad >= 0)
-
-        # This is different from the original implementation, due to export limitations.
-        resized = torch.nn.functional.pad(
-            img,
-            (l_pad, r_pad, t_pad, b_pad),
-        )
-        # originally:
-        # resized = F.pad(
-        #     img,
-        #     padding=(l_pad, t_pad, r_pad, b_pad),
-        #     fill=tuple(int(x * 255) for x in self.image_mean),
-        # )
-
-        # TODO: implement _upsample_bicubic_aa.out in portable kernel library.
-        # here padded shape should be max(h, w) x max(h, w)
-        # skipping resize for now due to missing _upsample_bicubic_aa kernel in portable
-        # resized = resize(
-        #     padded,
-        #     size=[
-        #         self.image_processor.crop_size["height"],
-        #         self.image_processor.crop_size["width"],
-        #     ],
-        #     interpolation="bicubic",
-        # )
-        # torch._check(resized.size(1) == self.config.crop_size["height"])
-        # torch._check(resized.size(2) == self.config.crop_size["width"])
-        # print(resized.shape)
-        # cropped = F.center_crop(img, output_size=[w, w])
-        # print(cropped.shape)
-        scaled = resized * rescale_factor
-        # print(scaled)
-        from torchvision.transforms.v2 import functional as tvF
-        normed = tvF.normalize(
-            scaled, image_mean, image_std
-        )
-        # print(normed)
-        return normed.unsqueeze(0)
-
-    pre_tokens = torch.tensor([[    1,   319, 13563,  1546,   263, 12758,  5199,   322,   385, 23116,
-         21082, 20255, 29889,   450, 20255,  4076,  8444, 29892, 13173, 29892,
-           322,  1248,   568,  6089,   304,   278,  5199, 29915, 29879,  5155,
-         29889,  3148,  1001, 29901, 29871]])
-    img = prepare_image(336, 336)
-    post_tokens = torch.tensor([[29871,    13,   462,  9651,  1724,   526,   278,  2712,   306,   881,
-           367,   274,  1300,  2738,  1048,   746,   306,  6493,  1244, 29973,
-           319,  1799,  9047, 13566, 29901]])
-
-    llava_model = Model.from_params("/home/gasoonjia/torchchat/torchchat/model_params/llava-1.5.json")
-    llava_model.setup_caches(1, 2048)
-    img = image_preprocess(img=img, target_h=336, target_w=336, image_mean=[0.48145466, 0.4578275, 0.40821073], image_std=[0.26862954, 0.26130258, 0.27577711], rescale_factor=0.00392156862745098)
-    llava_model(tokens=pre_tokens, encoder_input=img, post_tokens=post_tokens)
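The resize-then-pad logic removed above is worth seeing in isolation: the image is first shrunk by the larger of the height/width ratios so it fits the canvas, then padded to a square, with the ceil-division trick giving any odd leftover pixel to the right/bottom side. A minimal self-contained sketch with illustrative sizes (500x1344 is a hypothetical source image, not the sample photo used above):

    import torch

    target_h, target_w = 336, 336
    h, w = 500, 1344                               # hypothetical source image size
    ratio = max(h / target_h, w / target_w)        # shrink by the larger ratio
    out_h, out_w = int(h / ratio), int(w / ratio)  # 125 x 336, fits the canvas

    img = torch.zeros(3, out_h, out_w)
    l_pad = (target_w - out_w) // 2                # floor division
    t_pad = (target_h - out_h) // 2                # 211 // 2 = 105
    r_pad = -((target_w - out_w) // -2)            # ceil division via negated floor
    b_pad = -((target_h - out_h) // -2)            # 106, so 105 + 106 = 211
    padded = torch.nn.functional.pad(img, (l_pad, r_pad, t_pad, b_pad))
    assert padded.shape == (3, target_h, target_w)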