
Commit 92e1179

support quant_policy (#1556)

1 parent 3953be7 commit 92e1179

4 files changed: +11 -1 lines changed


docs/source/LLM/命令行参数.md

Lines changed: 1 addition & 0 deletions

@@ -321,6 +321,7 @@ RLHF参数继承了sft参数, 除此之外增加了以下参数:
 ### lmdeploy 参数
 - `--tp`: tensor并行, 用于初始化lmdeploy引擎的参数, 默认值为`1`.
 - `--cache_max_entry_count`: 初始化lmdeploy引擎的参数, 默认值为`0.8`.
+- `--quant_policy`: Key-Value Cache量化, 初始化lmdeploy引擎的参数, 默认值为`0`, 你可以设置为4, 8.
 - `--vision_batch_size`: 初始化lmdeploy引擎的参数, 默认值为`1`. 该参数只有在使用多模态模型时生效.

docs/source_en/LLM/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions

@@ -324,6 +324,7 @@ RLHF parameters are an extension of the sft parameters, with the addition of the
 ### lmdeploy Parameters
 - `--tp`: Tensor parallelism, a parameter for initializing the lmdeploy engine, default value is `1`.
 - `--cache_max_entry_count`: Parameter to initialize the lmdeploy engine, default value is `0.8`.
+- `--quant_policy`: Quantization of Key-Value Cache, parameters for initializing the lmdeploy engine, default value is `0`, you can set it to 4 or 8.
 - `--vision_batch_size`: Parameter to initialize the lmdeploy engine, default value is `1`. This parameter is effective only when using multimodal models.

 ## export Parameters
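
For context (not part of this commit), a minimal sketch of how the documented flag could be passed programmatically, assuming ms-swift and lmdeploy are installed; the model_type value `'qwen-7b-chat'` is a hypothetical placeholder:

```python
# Hedged sketch: calls get_lmdeploy_engine (the function changed in this commit)
# with the new quant_policy argument. The model_type is a placeholder, not from the commit.
from swift.llm.utils.lmdeploy_utils import get_lmdeploy_engine

engine = get_lmdeploy_engine(
    'qwen-7b-chat',             # hypothetical model_type
    tp=1,                       # tensor parallelism
    cache_max_entry_count=0.8,  # KV cache memory fraction
    quant_policy=8)             # 0 = no KV cache quantization, 4 = 4-bit, 8 = 8-bit
```

On the command line this corresponds to passing `--quant_policy 8` (or `4`) to the inference command, as described in the documentation change above.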

swift/llm/utils/argument.py

Lines changed: 1 addition & 0 deletions

@@ -1243,6 +1243,7 @@ class InferArguments(ArgumentsBase):
     # lmdeploy
     tp: int = 1
     cache_max_entry_count: float = 0.8
+    quant_policy: int = 0  # e.g. 4, 8
     vision_batch_size: int = 1  # max_batch_size in VisionConfig

     # compatibility. (Deprecated)

swift/llm/utils/lmdeploy_utils.py

Lines changed: 8 additions & 1 deletion

@@ -36,6 +36,7 @@ def get_lmdeploy_engine(
         revision: Optional[str] = None,
         tp: int = 1,
         cache_max_entry_count: float = 0.8,
+        quant_policy: int = 0,  # e.g. 4, 8
         vision_batch_size: int = 1,  # max_batch_size in VisionConfig
         engine_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs) -> Union[AsyncEngine, VLAsyncEngine]:
@@ -54,6 +55,7 @@ def get_lmdeploy_engine(
         engine_kwargs = {}
     engine_kwargs['tp'] = tp
     engine_kwargs['cache_max_entry_count'] = cache_max_entry_count
+    engine_kwargs['quant_policy'] = quant_policy

     backend_config = TurbomindEngineConfig(**engine_kwargs)
     backend_config = autoget_backend_config(model_dir, backend_config)
@@ -371,7 +373,12 @@ def prepare_lmdeploy_engine_template(args: InferArguments) -> Tuple[Union[AsyncE
     elif args.model_id_or_path is not None:
         model_id_or_path = args.model_id_or_path
     lmdeploy_engine = get_lmdeploy_engine(
-        args.model_type, tp=args.tp, vision_batch_size=args.vision_batch_size, model_id_or_path=model_id_or_path)
+        args.model_type,
+        tp=args.tp,
+        cache_max_entry_count=args.cache_max_entry_count,
+        quant_policy=args.quant_policy,
+        vision_batch_size=args.vision_batch_size,
+        model_id_or_path=model_id_or_path)
     tokenizer = lmdeploy_engine.hf_tokenizer

     if not args.do_sample:
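
To make the data flow concrete, here is a minimal sketch (not from the commit) of what the engine setup above effectively does on the lmdeploy side; it assumes lmdeploy is installed and only illustrates how `quant_policy` reaches `TurbomindEngineConfig`:

```python
# Hedged sketch of the engine_kwargs -> TurbomindEngineConfig flow shown in the hunk above.
from lmdeploy import TurbomindEngineConfig

engine_kwargs = {
    'tp': 1,                       # --tp
    'cache_max_entry_count': 0.8,  # --cache_max_entry_count
    'quant_policy': 8,             # --quant_policy: 0 (off), 4- or 8-bit KV cache
}
backend_config = TurbomindEngineConfig(**engine_kwargs)
print(backend_config.quant_policy)  # 8
```

In the real code the resulting config is further passed through `autoget_backend_config` before the engine is built, as the context lines of the second hunk show.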
