
Commit 92e1179

support quant_policy (#1556)

1 parent 3953be7 commit 92e1179

4 files changed: +11 -1 lines changed


docs/source/LLM/命令行参数.md

Lines changed: 1 addition & 0 deletions

@@ -321,6 +321,7 @@ RLHF参数继承了sft参数, 除此之外增加了以下参数:
 ### lmdeploy 参数
 - `--tp`: tensor并行, 用于初始化lmdeploy引擎的参数, 默认值为`1`.
 - `--cache_max_entry_count`: 初始化lmdeploy引擎的参数, 默认值为`0.8`.
+- `--quant_policy`: Key-Value Cache量化, 初始化lmdeploy引擎的参数, 默认值为`0`, 你可以设置为4, 8.
 - `--vision_batch_size`: 初始化lmdeploy引擎的参数, 默认值为`1`. 该参数只有在使用多模态模型时生效.

docs/source_en/LLM/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions

@@ -324,6 +324,7 @@ RLHF parameters are an extension of the sft parameters, with the addition of the
 ### lmdeploy Parameters
 - `--tp`: Tensor parallelism, a parameter for initializing the lmdeploy engine, default value is `1`.
 - `--cache_max_entry_count`: Parameter to initialize the lmdeploy engine, default value is `0.8`.
+- `--quant_policy`: Quantization of Key-Value Cache, parameters for initializing the lmdeploy engine, default value is `0`, you can set it to 4 or 8.
 - `--vision_batch_size`: Parameter to initialize the lmdeploy engine, default value is `1`. This parameter is effective only when using multimodal models.

 ## export Parameters
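
For context (not part of this commit), a minimal sketch of how the documented flag could be passed programmatically, assuming ms-swift and lmdeploy are installed; the model_type value `'qwen-7b-chat'` is a hypothetical placeholder:

```python
# Hedged sketch: calls get_lmdeploy_engine (the function changed in this commit)
# with the new quant_policy argument. The model_type is a placeholder, not from the commit.
from swift.llm.utils.lmdeploy_utils import get_lmdeploy_engine

engine = get_lmdeploy_engine(
    'qwen-7b-chat',             # hypothetical model_type
    tp=1,                       # tensor parallelism
    cache_max_entry_count=0.8,  # KV cache memory fraction
    quant_policy=8)             # 0 = no KV cache quantization, 4 = 4-bit, 8 = 8-bit
```

On the command line this corresponds to passing `--quant_policy 8` (or `4`) to the inference command, as described in the documentation change above.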

swift/llm/utils/argument.py

Lines changed: 1 addition & 0 deletions

@@ -1243,6 +1243,7 @@ class InferArguments(ArgumentsBase):
     # lmdeploy
     tp: int = 1
     cache_max_entry_count: float = 0.8
+    quant_policy: int = 0  # e.g. 4, 8
     vision_batch_size: int = 1  # max_batch_size in VisionConfig

     # compatibility. (Deprecated)

swift/llm/utils/lmdeploy_utils.py

Lines changed: 8 additions & 1 deletion

@@ -36,6 +36,7 @@ def get_lmdeploy_engine(
         revision: Optional[str] = None,
         tp: int = 1,
         cache_max_entry_count: float = 0.8,
+        quant_policy: int = 0,  # e.g. 4, 8
         vision_batch_size: int = 1,  # max_batch_size in VisionConfig
         engine_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs) -> Union[AsyncEngine, VLAsyncEngine]:
@@ -54,6 +55,7 @@ def get_lmdeploy_engine(
         engine_kwargs = {}
     engine_kwargs['tp'] = tp
     engine_kwargs['cache_max_entry_count'] = cache_max_entry_count
+    engine_kwargs['quant_policy'] = quant_policy

     backend_config = TurbomindEngineConfig(**engine_kwargs)
     backend_config = autoget_backend_config(model_dir, backend_config)
@@ -371,7 +373,12 @@ def prepare_lmdeploy_engine_template(args: InferArguments) -> Tuple[Union[AsyncE
     elif args.model_id_or_path is not None:
         model_id_or_path = args.model_id_or_path
     lmdeploy_engine = get_lmdeploy_engine(
-        args.model_type, tp=args.tp, vision_batch_size=args.vision_batch_size, model_id_or_path=model_id_or_path)
+        args.model_type,
+        tp=args.tp,
+        cache_max_entry_count=args.cache_max_entry_count,
+        quant_policy=args.quant_policy,
+        vision_batch_size=args.vision_batch_size,
+        model_id_or_path=model_id_or_path)
     tokenizer = lmdeploy_engine.hf_tokenizer

     if not args.do_sample:
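
To make the data flow concrete, here is a minimal sketch (not from the commit) of what the engine setup above effectively does on the lmdeploy side; it assumes lmdeploy is installed and only illustrates how `quant_policy` reaches `TurbomindEngineConfig`:

```python
# Hedged sketch of the engine_kwargs -> TurbomindEngineConfig flow shown in the hunk above.
from lmdeploy import TurbomindEngineConfig

engine_kwargs = {
    'tp': 1,                       # --tp
    'cache_max_entry_count': 0.8,  # --cache_max_entry_count
    'quant_policy': 8,             # --quant_policy: 0 (off), 4- or 8-bit KV cache
}
backend_config = TurbomindEngineConfig(**engine_kwargs)
print(backend_config.quant_policy)  # 8
```

In the real code the resulting config is further passed through `autoget_backend_config` before the engine is built, as the context lines of the second hunk show.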
