We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 6c5a5db commit da5fd35 — Copy full SHA for da5fd35
lightllm/utils/envs_utils.py
@@ -137,6 +137,12 @@ def get_redundancy_expert_update_max_load_count():
137
return int(os.getenv("LIGHTLLM_REDUNDANCY_EXPERT_UPDATE_MAX_LOAD_COUNT", 1))
138
139
140
+# get_kv_quant_calibration_warmup_count 和 get_kv_quant_calibration_inference_count 是
141
+# 当模型以fp8 kv quant 的在线统计量化模式启动的时候使用的配置变量,用于在线校准fp8 kv 的scale
142
+# 校准完成后,保存为 .json 的配置文件,后续模型可以加载该配置文件,实现离线的fp8 kv 量化推理,
143
+# 提升 kv cache 对应的token容量。
144
+
145
146
@lru_cache(maxsize=None)
147
def get_kv_quant_calibration_warmup_count():
148
# 服务启动后前warmup次推理不计入量化校准统计
0 commit comments