Skip to content

Commit d4dbc60

Browse files
committed
2 parents 52625f8 + f479b07 commit d4dbc60

File tree

4 files changed

+15
-8
lines changed

4 files changed

+15
-8
lines changed

generate.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
def device_sync(device):
1717
if "cuda" in device:
1818
torch.cuda.synchronize(device)
19-
elif "cpu" in device:
19+
elif ("cpu" in device) or ("mps" in device):
2020
pass
2121
else:
2222
print(f"device={device} is not yet suppported")
@@ -26,6 +26,7 @@ def device_sync(device):
2626
torch._inductor.config.triton.unique_kernel_names = True
2727
torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
2828

29+
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
2930

3031
# support running without installing as a package
3132
wd = Path(__file__).parent.parent.resolve()
@@ -206,7 +207,7 @@ def generate(
206207
}
207208
return seq, generate_stats
208209

209-
def encode_tokens(tokenizer, string, bos=True, device='cuda'):
210+
def encode_tokens(tokenizer, string, bos=True, device=default_device):
210211
tokens = tokenizer.encode(string)
211212
if bos:
212213
tokens = [tokenizer.bos_id()] + tokens
@@ -259,7 +260,7 @@ def main(
259260
profile: Optional[Path] = None,
260261
draft_checkpoint_path: Optional[Path] = None,
261262
speculate_k: int = 5,
262-
device='cuda',
263+
device=default_device,
263264
) -> None:
264265
"""Generates text samples based on a pre-trained Transformer model and tokenizer.
265266
"""
@@ -414,7 +415,7 @@ def callback(x):
414415
parser.add_argument('--profile', type=Path, default=None, help='Profile path.')
415416
parser.add_argument('--speculate_k', type=int, default=5, help='Speculative execution depth.')
416417
parser.add_argument('--draft_checkpoint_path', type=Path, default=None, help='Draft checkpoint path.')
417-
parser.add_argument('--device', type=str, default="cuda", help='Device to use')
418+
parser.add_argument('--device', type=str, default=default_device, help='Device to use')
418419

419420
args = parser.parse_args()
420421
main(

mixtral-moe/scripts/download.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) -
1313
from huggingface_hub import snapshot_download
1414
os.makedirs(f"checkpoints/{repo_id}", exist_ok=True)
1515
try:
16-
snapshot_download(repo_id, local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token)
16+
snapshot_download(repo_id, local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token, ignore_patterns="*.safetensors")
1717
except HTTPError as e:
1818
if e.response.status_code == 401:
1919
print("You need to pass a valid `--hf_token=...` to download private checkpoints.")

quantize.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,8 @@
1919

2020
from model import Transformer
2121

22+
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
23+
2224
##### Quantization Primitives ######
2325

2426
def dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype):
@@ -539,7 +541,7 @@ def quantize(
539541
percdamp: float = .01,
540542
blocksize: int = 128,
541543
label: str = '',
542-
device: str = 'cuda',
544+
device: str = default_device,
543545
) -> None:
544546
assert checkpoint_path.is_file(), checkpoint_path
545547

@@ -619,7 +621,7 @@ def quantize(
619621
parser.add_argument('--percdamp', type=float, default=.01, help='gptq percentage dampening')
620622
parser.add_argument('--blocksize', type=int, default=128, help='blocksize for gptq')
621623
parser.add_argument('--label', type=str, default='_', help='label to add to output filename')
622-
parser.add_argument('--device', type=str, default='cuda', help='device to use')
624+
parser.add_argument('--device', type=str, default=default_device, help='device to use')
623625

624626
args = parser.parse_args()
625627
quantize(args.checkpoint_path, args.mode, args.groupsize, args.calibration_tasks, args.calibration_limit, args.calibration_seq_length, args.pad_calibration_inputs, args.percdamp, args.blocksize, args.label, args.device)

tp.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,11 @@
99
import torch
1010
import torch.distributed as dist
1111
from torch import nn
12-
from torch.distributed import _functional_collectives as funcol
12+
if os.uname().sysname != "Darwin":
13+
from torch.distributed import _functional_collectives as funcol
14+
else:
15+
# Distributed is not supported on MacOS
16+
funcol = None
1317

1418
from model import Attention, FeedForward, Transformer
1519
from quantize import WeightOnlyInt4Linear

0 commit comments

Comments (0)