-
Notifications
You must be signed in to change notification settings - Fork 6.5k
Speedup model loading by 4-5x ⚡ #11904
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
7a9c448
8385f45
9e4873b
20b1155
b776aaa
e364dfd
ea446b1
e736b09
4c81c96
591655e
582af9b
a6ee660
bbbc4c0
39f0850
58fcfdc
b0552bb
275e470
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -16,9 +16,10 @@ | |||
|
|
||||
| import importlib | ||||
| import inspect | ||||
| import math | ||||
| import os | ||||
| from array import array | ||||
| from collections import OrderedDict | ||||
| from collections import OrderedDict, defaultdict | ||||
| from pathlib import Path | ||||
| from typing import Dict, List, Optional, Union | ||||
| from zipfile import is_zipfile | ||||
|
|
@@ -38,6 +39,7 @@ | |||
| _get_model_file, | ||||
| deprecate, | ||||
| is_accelerate_available, | ||||
| is_accelerate_version, | ||||
| is_gguf_available, | ||||
| is_torch_available, | ||||
| is_torch_version, | ||||
|
|
@@ -243,15 +245,19 @@ def load_model_dict_into_meta( | |||
| if keep_in_fp32_modules is not None and any( | ||||
| module_to_keep_in_fp32 in param_name.split(".") for module_to_keep_in_fp32 in keep_in_fp32_modules | ||||
| ): | ||||
| param = param.to(torch.float32) | ||||
| param = param.to(torch.float32, non_blocking=True) | ||||
| set_module_kwargs["dtype"] = torch.float32 | ||||
| # For quantizers that have saved weights using torch.float8_e4m3fn | ||||
| elif hf_quantizer is not None and param.dtype == getattr(torch, "float8_e4m3fn", None): | ||||
| pass | ||||
| else: | ||||
| param = param.to(dtype) | ||||
| param = param.to(dtype, non_blocking=True) | ||||
| set_module_kwargs["dtype"] = dtype | ||||
|
|
||||
| if is_accelerate_version(">=", "1.9.0.dev0"): | ||||
a-r-r-o-w marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||
| set_module_kwargs["non_blocking"] = True | ||||
| set_module_kwargs["_empty_cache"] = False | ||||
|
|
||||
| # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model, and which | ||||
| # uses `param.copy_(input_param)` that preserves the contiguity of the parameter in the model. | ||||
| # Reference: https://github.com/pytorch/pytorch/blob/db79ceb110f6646523019a59bbd7b838f43d4a86/torch/nn/modules/module.py#L2040C29-L2040C29 | ||||
|
|
@@ -265,7 +271,7 @@ def load_model_dict_into_meta( | |||
|
|
||||
| if old_param is not None: | ||||
| if dtype is None: | ||||
| param = param.to(old_param.dtype) | ||||
| param = param.to(old_param.dtype, non_blocking=True) | ||||
|
|
||||
| if old_param.is_contiguous(): | ||||
| param = param.contiguous() | ||||
|
|
@@ -520,3 +526,64 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): | |||
| parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights | ||||
|
|
||||
| return parsed_parameters | ||||
|
|
||||
|
|
||||
| def _find_mismatched_keys( | ||||
|
||||
| def _find_mismatched_keys( |
a-r-r-o-w marked this conversation as resolved.
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -182,5 +182,14 @@ def get_device(): | |
| def empty_device_cache(device_type: Optional[str] = None): | ||
| if device_type is None: | ||
| device_type = get_device() | ||
| if device_type in ["cpu"]: | ||
| return | ||
| device_mod = getattr(torch, device_type, torch.cuda) | ||
| device_mod.empty_cache() | ||
|
|
||
|
|
||
| def device_synchronize(device_type: Optional[str] = None): | ||
| if device_type is None: | ||
| device_type = get_device() | ||
| device_mod = getattr(torch, device_type, torch.cuda) | ||
| device_mod.synchronize() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess all the different backends ought to have this method. Just flagging. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. AFAIK, `synchronize` should be available on all devices. Only the `empty_cache` function required a special check, because it would fail if the device was `cpu`. |
||
Uh oh!
There was an error while loading. Please reload this page.