
Commit fce11b6 ("pich")

Parent: f104ba8

4 files changed (+14, -3 lines)

gpt_server/model_worker/base/model_worker_base.py (8 additions, 2 deletions)

@@ -203,7 +203,7 @@ def get_worker(
         controller_addr: str = "http://localhost:21001",
         worker_id: str = str(uuid.uuid4())[:8],
         model_names: List[str] = [""],
-        limit_worker_concurrency: int = 512,
+        limit_worker_concurrency: int = 1024,
         conv_template: str = None,  # type: ignore
     ):
         worker = cls(
@@ -251,6 +251,8 @@ def run(cls):
         parser.add_argument("--log_level", type=str, default="WARNING")
         # task_type
         parser.add_argument("--task_type", type=str, default="auto")
+        # limit_worker_concurrency
+        parser.add_argument("--limit_worker_concurrency", type=int, default=1024)
         args = parser.parse_args()
         os.environ["num_gpus"] = str(args.num_gpus)
         if args.backend == "vllm":
@@ -279,6 +281,7 @@ def run(cls):
         os.environ["dtype"] = args.dtype
         os.environ["log_level"] = args.log_level
         os.environ["task_type"] = args.task_type
+        limit_worker_concurrency = int(args.limit_worker_concurrency)
         logger.remove(0)
         log_level = os.getenv("log_level", "WARNING")
         logger.add(sys.stderr, level=log_level)
@@ -291,6 +294,8 @@
         os.environ["WORKER_PORT"] = str(port)
         os.environ["WORKER_HOST"] = str(local_ip)
         worker_addr = f"http://{host}:{port}"
+        model_names = args.model_names
+        logger.info(f"{model_names[0]} args: \n{args}")

         @app.on_event("startup")
         async def startup():
@@ -299,9 +304,10 @@ async def startup():
             worker = cls.get_worker(
                 worker_addr=worker_addr,
                 model_path=args.model_name_or_path,
-                model_names=args.model_names,
+                model_names=model_names,
                 conv_template="chatglm3",  # TODO: defaults to chatglm3 for unified handling
                 controller_addr=controller_address,
+                limit_worker_concurrency=limit_worker_concurrency,
             )

         uvicorn.run(app, host=host, port=port)
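
The new default of 1024 only raises the ceiling; the limit itself is enforced inside the worker. As a rough illustration of the usual FastChat-style pattern (a minimal sketch assuming a semaphore-based gate; the class and method names below are illustrative, not gpt_server's actual internals):

import asyncio

class WorkerConcurrencyGate:
    """Bounds the number of in-flight generation requests."""

    def __init__(self, limit_worker_concurrency: int = 1024):
        self.semaphore = asyncio.Semaphore(limit_worker_concurrency)

    async def generate(self, payload: dict) -> dict:
        # Request number 1025 blocks here until one of the 1024
        # in-flight requests releases its slot.
        async with self.semaphore:
            return await self._run_model(payload)

    async def _run_model(self, payload: dict) -> dict:
        await asyncio.sleep(0)  # stand-in for real inference
        return {"text": "...", "error_code": 0}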

gpt_server/script/config_example.yaml (1 addition, 0 deletions)

@@ -22,6 +22,7 @@ model_worker_args:
   host: 0.0.0.0
   controller_address: http://localhost:21001 # address at which the model registers with the controller
   log_level: WARNING # DEBUG INFO WARNING ERROR
+  limit_worker_concurrency: 1024 # maximum worker concurrency, defaults to 1024

 models:
   - qwen:
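
Because the loader falls back to a default (see the utils.py change below), existing configs without the new key keep working. A minimal sketch of reading the optional key, assuming PyYAML and the layout above (the file path is illustrative):

import yaml

with open("gpt_server/script/config_example.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# .get() supplies the default when the key is absent, so older
# config files continue to work unchanged.
limit_worker_concurrency = config["model_worker_args"].get(
    "limit_worker_concurrency", 1024
)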

gpt_server/serving/openai_api_server.py (1 addition, 1 deletion)

@@ -686,7 +686,7 @@ async def generate_completion_stream(payload: Dict[str, Any], worker_addr: str):
             worker_addr + "/worker_generate_stream",
             headers=headers,
             json=payload,
-            timeout=30,
+            timeout=60,
         ) as response:
             # content = await response.aread()
             buffer = b""
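
The doubled timeout gives slow generations more headroom before the API server aborts the stream. For context, a minimal sketch of the surrounding call shape, assuming the httpx streaming client (the header value and chunk handling are illustrative):

import httpx

async def generate_completion_stream(payload: dict, worker_addr: str):
    headers = {"User-Agent": "gpt_server API Server"}  # illustrative header
    async with httpx.AsyncClient() as client:
        async with client.stream(
            "POST",
            worker_addr + "/worker_generate_stream",
            headers=headers,
            json=payload,
            timeout=60,  # was 30; long generations could hit the old limit
        ) as response:
            async for raw_chunk in response.aiter_raw():
                yield raw_chunk

Note that passing a bare number to httpx sets its connect, read, write, and pool timeouts all to 60 seconds.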

gpt_server/utils.py (4 additions, 0 deletions)

@@ -136,6 +136,9 @@ def start_model_worker(config: dict):
         host = config["model_worker_args"]["host"]
         controller_address = config["model_worker_args"]["controller_address"]
         log_level = config["model_worker_args"].get("log_level", "WARNING")
+        limit_worker_concurrency = config["model_worker_args"].get(
+            "limit_worker_concurrency", 1024
+        )
     except KeyError as e:
         error_msg = f"Please refer to https://github.com/shell-nlp/gpt_server/blob/main/gpt_server/script/config.yaml to set model_worker_args correctly"
         logger.error(error_msg)
@@ -254,6 +257,7 @@ def start_model_worker(config: dict):
             + f" --kv_cache_quant_policy {kv_cache_quant_policy}"  # kv cache quantization policy
             + f" --log_level {log_level}"  # log level
             + f" --task_type {task_type}"  # task type
+            + f" --limit_worker_concurrency {limit_worker_concurrency}"  # limit worker concurrency
         )
         # Handle values that are None
         if lora:
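
End to end, the value read from the YAML config is appended to the worker command line here and parsed back by the argparse setup shown in model_worker_base.py above. A quick round-trip sketch reproducing only the relevant arguments:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--log_level", type=str, default="WARNING")
parser.add_argument("--task_type", type=str, default="auto")
parser.add_argument("--limit_worker_concurrency", type=int, default=1024)

# Value supplied on the command line, as start_model_worker does:
args = parser.parse_args(["--limit_worker_concurrency", "2048"])
assert args.limit_worker_concurrency == 2048

# Flag omitted: the argparse default of 1024 applies.
args = parser.parse_args([])
assert args.limit_worker_concurrency == 1024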
