Merge branch 'main' into refactor/distributed_inference_without_abstraction

Jack-Khuu · web-flow · commit daf902c503dd · 2024-12-18T19:24:12.000-08:00
diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt
@@ -1 +1 @@
-7d7c14e898eca3fe66138d2a9445755a9270b800
+2e032c6b0de960dee554dcb08126ace718b14c6d
diff --git a/install/install_requirements.sh b/install/install_requirements.sh
@@ -44,31 +44,20 @@ fi
 
 echo "Using pip executable: $PIP_EXECUTABLE"
 
-#
-# First install requirements in install/requirements.txt. Older torch may be
-# installed from the dependency of other models. It will be overridden by
-# newer version of torch nightly installed later in this script.
-#
-
-(
-  set -x
-  $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cu121
-)
-
 # Since torchchat often uses main-branch features of pytorch, only the nightly
 # pip versions will have the required features. The PYTORCH_NIGHTLY_VERSION value should
 # agree with the third-party/pytorch pinned submodule commit.
 #
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20241213
+PYTORCH_NIGHTLY_VERSION=dev20241218
 
 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20241213
+VISION_NIGHTLY_VERSION=dev20241218
 
 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20241126
+TUNE_NIGHTLY_VERSION=dev20241218
 
 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
@@ -96,6 +85,16 @@ REQUIREMENTS_TO_INSTALL=(
   torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}"
 )
 
+#
+# First install requirements in install/requirements.txt. Older torch may be
+# installed from the dependency of other models. It will be overridden by
+# newer version of torch nightly installed later in this script.
+#
+(
+  set -x
+  $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url "${TORCH_NIGHTLY_URL}"
+)
+
 # Install the requirements. --extra-index-url tells pip to look for package
 # versions on the provided URL if they aren't available on the default URL.
 (
diff --git a/torchchat/utils/gguf_loader.py b/torchchat/utils/gguf_loader.py
@@ -24,6 +24,8 @@
     pack_scales_and_zeros,
 )
 
+from torchao.dtypes.utils import is_device
+
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -128,6 +130,7 @@ def linear_int4(input, weight_int4pack, scales_and_zeros, out_features, groupsiz
             groupsize,
             scales_and_zeros,
         )
+
     new_shape = origin_input_size[:-1] + (out_features,)
     c = c.reshape(new_shape)
     return c
@@ -178,16 +181,27 @@ def __init__(
         ), "must specify both weights and scales_and_zeros, or neither"
 
         if weight is None:
-            weight = torch.empty(
-                (
-                    out_features // 8,
-                    in_features // (inner_k_tiles * 16),
-                    32,
-                    inner_k_tiles // 2,
-                ),
-                dtype=torch.int32,
-                device=device,
-            )
+            if is_device(device, "cpu"):
+                weight = torch.empty(
+                    (
+                        out_features,
+                        in_features // 2,
+                    ),
+                    dtype=torch.uint8,
+                    device=device,
+                )
+            else:
+                weight = torch.empty(
+                    (
+                        out_features // 8,
+                        in_features // (inner_k_tiles * 16),
+                        32,
+                        inner_k_tiles // 2,
+                    ),
+                    dtype=torch.int32,
+                    device=device,
+                )
+
             scales_and_zeros = torch.empty(
                 (in_features // groupsize, out_features, 2),
                 dtype=get_precision(),
@@ -223,12 +237,17 @@ def _prepare_weight_and_scales_and_zeros(
         weight_int32, scales_and_zeros = group_quantize_tensor(
             weight_bf16, n_bit=4, groupsize=groupsize
         )
-        weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to(
-            torch.uint8
-        )
-        weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-            weight_uint8, inner_k_tiles
-        )
+        if is_device(weight_int32.device.type, "cpu"):
+            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu(
+                weight_int32, inner_k_tiles
+            )
+        else:
+            weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to(
+                torch.uint8
+            )
+            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+                weight_uint8, inner_k_tiles
+            )
         return weight_int4pack, scales_and_zeros
 
     @classmethod
@@ -609,17 +628,14 @@ def load_model_and_state_dict(
             if load_state_dict:
                 q, s, z = Q4_0.unpack(t)
                 scales_and_zeros = pack_scales_and_zeros(s, z)
-                q_uint8 = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
-
-                if torch.device(device).type == "cpu":
-                    weight_int4pack = (
-                        torch.ops.aten._convert_weight_to_int4pack_for_cpu(
-                            q, inner_k_tiles
-                        )
+                if is_device(q.device.type, "cpu"):
+                    weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu(
+                        q, inner_k_tiles
                     )
                 else:
+                    q_tmp = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
                     weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-                        q_uint8, inner_k_tiles
+                        q_tmp, inner_k_tiles
                     )
                 state_dict[f"{fqn}.weight"] = weight_int4pack
                 state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros
@@ -632,7 +648,7 @@ def load_model_and_state_dict(
                     in_features=in_features,
                     out_features=out_features,
                     bias=False,
-                    device="meta",
+                    device="cpu",
                     groupsize=Q4_0.groupsize,
                     inner_k_tiles=inner_k_tiles,
                 ),
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py
@@ -932,13 +932,15 @@ def quantized_model(self) -> nn.Module:
         libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*")
         libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
         torch.ops.load_library(libs[0])
+        print("Loaded torchao cpu ops.")
     except Exception as e:
         print("Unabled to load torchao cpu ops library. Slow fallback kernels will be used.")
 
     try:
         libname = "libtorchao_ops_mps_aten.dylib"
         libpath = f"{torchao_build_path}/cmake-out/lib/{libname}"
         torch.ops.load_library(libpath)
+        print("Loaded torchao mps ops.")
     except Exception as e:
         print("Unabled to load torchao mps ops library.")
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-7d7c14e898eca3fe66138d2a9445755a9270b800` |
|  | `1` | `+2e032c6b0de960dee554dcb08126ace718b14c6d` |