This repository was archived by the owner on Sep 10, 2025. It is now read-only.
3 files changed: +13 -3 lines changed

@@ -731,6 +731,7 @@ jobs:
           git clone https://github.com/ggerganov/llama.cpp.git
           pushd llama.cpp
+          git checkout 64ed2091b24b2f9747148fdf49a34ed5938762c3
           make
           popd
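The workflow change above pins the llama.cpp build to a fixed commit so the CI job is reproducible rather than tracking the moving tip of master. As a minimal sketch only, the same pinning could be scripted in Python (the URL and SHA come from the diff; the use of subprocess is illustrative and assumes git and make are on PATH):

import subprocess

# Clone llama.cpp, check out the pinned commit from the workflow step above, then build.
LLAMA_CPP_URL = "https://github.com/ggerganov/llama.cpp.git"
PINNED_SHA = "64ed2091b24b2f9747148fdf49a34ed5938762c3"

subprocess.run(["git", "clone", LLAMA_CPP_URL], check=True)
subprocess.run(["git", "-C", "llama.cpp", "checkout", PINNED_SHA], check=True)
subprocess.run(["make", "-C", "llama.cpp"], check=True)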
@@ -373,6 +373,8 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model:
         kwargs = {}
     else:
         kwargs = builder_args.gguf_kwargs
+
+    kwargs.setdefault("device", builder_args.device)
     model = Model.from_gguf(builder_args.gguf_path, **kwargs)
     return model
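A note on the setdefault call added above: it fills in the device from builder_args only when the caller's gguf_kwargs did not already specify one, so an explicit device in gguf_kwargs still wins. A minimal sketch of that behavior with illustrative values:

# Caller already chose a device in gguf_kwargs: setdefault leaves it alone.
kwargs = {"device": "cuda"}
kwargs.setdefault("device", "cpu")
assert kwargs["device"] == "cuda"

# No device in gguf_kwargs: the builder's device is used as the fallback.
kwargs = {}
kwargs.setdefault("device", "cpu")
assert kwargs["device"] == "cpu"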
@@ -570,6 +570,7 @@ def load_model_and_state_dict(
     load_state_dict: bool = True,
     load_as_quantized: bool = True,
     inner_k_tiles=8,
+    device="cpu",
 ) -> torch.nn.Module:
     """
     Parses the GGUF file and returns an nn.Module on meta device along with a state_dict
@@ -609,9 +610,15 @@ def load_model_and_state_dict(
                 q, s, z = Q4_0.unpack(t)
                 scales_and_zeros = pack_scales_and_zeros(s, z)
                 q_uint8 = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
-                weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-                    q_uint8, inner_k_tiles
-                )
+
+                if torch.device(device).type == "cpu":
+                    weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu(
+                        q_uint8, inner_k_tiles
+                    )
+                else:
+                    weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+                        q_uint8, inner_k_tiles
+                    )
                 state_dict[f"{fqn}.weight"] = weight_int4pack
                 state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros
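The second hunk routes the int4 weight packing through the CPU-specific aten op when the requested device is CPU, and keeps the original op for any other device. A minimal sketch of that dispatch, factored into a standalone helper (the function name is hypothetical; the packed q_uint8 layout and the op arguments are taken from the diff above):

import torch

def pack_int4_weight(q_uint8: torch.Tensor, inner_k_tiles: int, device: str = "cpu") -> torch.Tensor:
    # Sketch of the dispatch in the diff: CPU gets the *_for_cpu packing op,
    # every other device type falls back to the original op.
    if torch.device(device).type == "cpu":
        return torch.ops.aten._convert_weight_to_int4pack_for_cpu(q_uint8, inner_k_tiles)
    return torch.ops.aten._convert_weight_to_int4pack(q_uint8, inner_k_tiles)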