Changes from Archana's branch

IanNod · IanNod · commit a0e5031cb813 · 2025-04-23T16:10:58.000-05:00
diff --git a/sharktank/sharktank/evaluate/perplexity_iree.py b/sharktank/sharktank/evaluate/perplexity_iree.py
@@ -271,16 +271,18 @@ def run_iree_module(iree_devices: list[ireert.HalDevice]):
             out_logits = torch.cat(self.out_logits, dim=1)
 
             pad_logits_shape = self.token_ids.shape[1] - out_logits.shape[1]
+            pad_logits = torch.zeros(
+                out_logits.shape[0], pad_logits_shape, out_logits.shape[2]
+
 
-        if self.out_logits.dtype == torch.float8_e4m3fnuz:
-            out_logits_as_int8 = self.out_logits.view(dtype=torch.int8)
-            self.out_logits = torch.cat((out_logits_as_int8, self.pad_logits), 1).to(
-                self.torch_device
-            )
-        else:
-            self.out_logits = torch.cat((self.out_logits, self.pad_logits), 1).to(
-                self.torch_device
             )
+
+            out_logits = torch.cat((out_logits, pad_logits), 1).to(self.torch_device)
+
+            return out_logits
+
+        return with_iree_device_context(run_iree_module, [self.runner.config.device])
+    
     @timeit
     def compute_perplexity(self):
         from torch.nn import CrossEntropyLoss
diff --git a/sharktank/sharktank/evaluate/perplexity_torch.py b/sharktank/sharktank/evaluate/perplexity_torch.py
@@ -98,7 +98,7 @@ def load_model(
 
         theta = dataset.root_theta
 
-        model = PagedLlmModelV1(theta, self.config)
+        model = PagedLlmModelV1(theta, config)
 
         self.generator = TorchGenerator(model, tokenizer)
 
diff --git a/sharktank/sharktank/layers/linear.py b/sharktank/sharktank/layers/linear.py
@@ -77,13 +77,13 @@ def forward(self, x):
         y = ops.linear(x, weight, bias)
         # Unconditionally dequantize.
         if self.q_output is not None:
-            # Probably dont need the custom kernel to return a float32 tensor as a PlanarQuantizedTensor
-            assert y.unpack().qs.dtype == torch.float32
-            y = self.q_output.quantize(y.unpack().qs)
-            if self.fake_quant:
-                return y.unpack().dequant()
-            return y.unpack().qs
-
+            if isinstance(y, QuantizedTensor):
+                # Probably don't need the custom kernel to return a float32 tensor as a PlanarQuantizedTensor
+                assert y.unpack().qs.dtype == torch.float32
+                y = self.q_output.quantize(y.unpack().qs)
+                if self.fake_quant:
+                    return y.unpack().dequant()
+                return y.unpack().qs
         if isinstance(y, QuantizedTensor):
             y = y.unpack().dequant()