diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 04e928216..3902163ef 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -95,12 +95,25 @@ jobs: uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a with: context: . + file: ./docker/Dockerfile push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha cache-to: type=gha,mode=max + # Build and push specific Docker image for deepep + # https://github.com/docker/build-push-action + - name: Build and push deepep Docker image + id: build-and-push-deepep + uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a + with: + context: . + file: ./docker/Dockerfile.deepep + push: ${{ github.event_name != 'pull_request' }} + tags: ghcr.io/modeltc/lightllm:main-deepep + cache-from: type=gha + cache-to: type=gha,mode=max # Sign the resulting Docker image digest except on PRs. # This will only write to the public Rekor transparency log when the Docker diff --git a/README.md b/README.md index 99bdeeb95..918aa3874 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
- LightLLM + LightLLM
@@ -29,8 +29,7 @@ LightLLM is a Python-based LLM (Large Language Model) inference and serving fram - [Install LightLLM](https://lightllm-en.readthedocs.io/en/latest/getting_started/installation.html) - [Quick Start](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html) -- [LLM Service](https://lightllm-en.readthedocs.io/en/latest/models/test.html#llama) -- [VLM Service](https://lightllm-en.readthedocs.io/en/latest/models/test.html#llava) +- [Tutorial](https://lightllm-en.readthedocs.io/en/latest/tutorial/) ## Performance @@ -45,23 +44,8 @@ Please refer to the [FAQ](https://lightllm-en.readthedocs.io/en/latest/faq.html) We welcome any cooperation and contribution. If there is a project that requires LightLLM's support, please contact us via email or create a pull request. - -1.
LazyLLM: Easyest and lazyest way for building multi-agent LLMs applications. - - Once you have installed `lightllm` and `lazyllm`, and then you can use the following code to build your own chatbot: - - ~~~python - from lazyllm import TrainableModule, deploy, WebModule - # Model will be download automatically if you have an internet connection - m = TrainableModule('internlm2-chat-7b').deploy_method(deploy.lightllm) - WebModule(m).start().wait() - ~~~ - - Documents: https://lazyllm.readthedocs.io/ - -
- Projects based on LightLLM or referenced LightLLM components: +- [LazyLLM](https://github.com/LazyAGI/LazyLLM) - [LoongServe, Peking University](https://github.com/LoongServe/LoongServe) - [OmniKV, Ant Group](https://github.com/antgroup/OmniKV) - [vLLM](https://github.com/vllm-project/vllm) (some LightLLM kernels are used) diff --git a/assets/logo_new.png b/assets/logo_new.png new file mode 100644 index 000000000..5b3b63917 Binary files /dev/null and b/assets/logo_new.png differ diff --git a/build_and_upload_docker.sh b/build_and_upload_docker.sh index 0b1897316..fc7fd871f 100755 --- a/build_and_upload_docker.sh +++ b/build_and_upload_docker.sh @@ -17,5 +17,9 @@ fi IMAGE_TAG=$2 ACCOUNT=$1 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com -DOCKER_BUILDKIT=1 docker build -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG . +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG . docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG + +#deepep +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.deepep -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep . +docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG-deepep \ No newline at end of file diff --git a/Dockerfile b/docker/Dockerfile similarity index 100% rename from Dockerfile rename to docker/Dockerfile diff --git a/docker/Dockerfile.deepep b/docker/Dockerfile.deepep new file mode 100644 index 000000000..ce47ea5af --- /dev/null +++ b/docker/Dockerfile.deepep @@ -0,0 +1,81 @@ +FROM nvcr.io/nvidia/tritonserver:24.04-py3-min as base +ARG PYTORCH_VERSION=2.6.0 +ARG PYTHON_VERSION=3.9 +ARG CUDA_VERSION=12.4 +ARG MAMBA_VERSION=23.1.0-1 +ARG TARGETPLATFORM + +ENV PATH=/opt/conda/bin:$PATH \ CONDA_PREFIX=/opt/conda + +RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ ca-certificates \ libssl-dev \ curl \ g++ \ make \ git && \ rm -rf /var/lib/apt/lists/* + +ENV http_proxy=http://devsft:d0663c03baee@10.119.176.202:3128 +ENV https_proxy=http://devsft:d0663c03baee@10.119.176.202:3128 +RUN case ${TARGETPLATFORM} in \ "linux/arm64") MAMBA_ARCH=aarch64 ;; \ *) MAMBA_ARCH=x86_64 ;; \ esac && \ curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \ bash ~/mambaforge.sh -b -p /opt/conda && \ rm ~/mambaforge.sh + +RUN case ${TARGETPLATFORM} in \ "linux/arm64") exit 1 ;; \ *) /opt/conda/bin/conda update -y conda && \ /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ esac && \ /opt/conda/bin/conda clean -ya + + +WORKDIR /root + +COPY ./requirements.txt /lightllm/requirements.txt +RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124 + +RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100 + +RUN git clone https://github.com/Dao-AILab/flash-attention.git -b v2.7.4.post1 +RUN cd flash-attention/hopper && FLASH_ATTN_CUDA_ARCHS=90 NVCC_THREADS=128 python setup.py install + +RUN git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git +RUN cd DeepGEMM && python setup.py install + +WORKDIR /root +RUN git clone https://github.com/deepseek-ai/DeepEP.git + +# NVSHMEM +RUN wget 
https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz +RUN tar -xf nvshmem_src_3.2.5-1.txz \ + && mv nvshmem_src nvshmem + +WORKDIR /root/nvshmem +RUN git apply /root/DeepEP/third-party/nvshmem.patch + +WORKDIR /root/nvshmem +ENV CUDA_HOME=/usr/local/cuda +RUN NVSHMEM_SHMEM_SUPPORT=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_USE_NCCL=0 \ + NVSHMEM_MPI_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=1 \ + NVSHMEM_PMIX_SUPPORT=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_USE_GDRCOPY=1 \ + cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 -DMLX5_lib=/usr/lib/x86_64-linux-gnu/libmlx5.so.1 \ + && cd build \ + && make install -j64 + +WORKDIR /root/DeepEP +ENV NVSHMEM_DIR=/root/nvshmem/install +RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install + +COPY . /lightllm +RUN pip install -e /lightllm --no-cache-dir \ No newline at end of file diff --git a/docs/CN/source/_static/openapi.json b/docs/CN/source/_static/openapi.json deleted file mode 100755 index d591ecf0a..000000000 --- a/docs/CN/source/_static/openapi.json +++ /dev/null @@ -1,536 +0,0 @@ -{ - "openapi": "3.0.2", - "info": { - "title": "FastAPI", - "version": "0.1.0" - }, - "paths": { - "/liveness": { - "get": { - "summary": "Liveness", - "operationId": "liveness_liveness_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "post": { - "summary": "Liveness", - "operationId": "liveness_liveness_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/readiness": { - "get": { - "summary": "Readiness", - "operationId": "readiness_readiness_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "post": { - "summary": "Readiness", - "operationId": "readiness_readiness_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/health": { - "get": { - "summary": "Check server health", - "operationId": "healthcheck_health_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "head": { - "summary": "Check server health", - "operationId": "healthcheck_health_head", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/healthz": { - "get": { - "summary": "Check server health", - "operationId": "healthcheck_healthz_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/token_load": { - "get": { - "summary": "Get the current server's load on tokens", - "operationId": "token_load_token_load_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/generate": { - "post": { - "summary": "Generate", - "operationId": "generate_generate_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/generate_stream": { - "post": { - "summary": "Generate Stream", - "operationId": 
"generate_stream_generate_stream_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/get_score": { - "post": { - "summary": "Get Score", - "operationId": "get_score_get_score_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/": { - "post": { - "summary": "Compat Generate", - "operationId": "compat_generate__post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/v1/chat/completions": { - "post": { - "summary": "Chat Completions", - "operationId": "chat_completions_v1_chat_completions_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionResponse" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } - } - } - }, - "/tokens": { - "get": { - "summary": "Tokens", - "operationId": "tokens_tokens_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "post": { - "summary": "Tokens", - "operationId": "tokens_tokens_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/metrics": { - "get": { - "summary": "Metrics", - "operationId": "metrics_metrics_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - } - }, - "components": { - "schemas": { - "ChatCompletionRequest": { - "title": "ChatCompletionRequest", - "required": [ - "model", - "messages" - ], - "type": "object", - "properties": { - "model": { - "title": "Model", - "type": "string" - }, - "messages": { - "title": "Messages", - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "type": "string" - } - } - }, - "function_call": { - "title": "Function Call", - "type": "string", - "default": "none" - }, - "temperature": { - "title": "Temperature", - "type": "number", - "default": 1 - }, - "top_p": { - "title": "Top P", - "type": "number", - "default": 1.0 - }, - "n": { - "title": "N", - "type": "integer", - "default": 1 - }, - "stream": { - "title": "Stream", - "type": "boolean", - "default": false - }, - "stop": { - "title": "Stop", - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "max_tokens": { - "title": "Max Tokens", - "type": "integer", - "default": 16 - }, - "presence_penalty": { - "title": "Presence Penalty", - "type": "number", - "default": 0.0 - }, - "frequency_penalty": { - "title": "Frequency Penalty", - "type": "number", - "default": 0.0 - }, - "logit_bias": { - "title": "Logit Bias", - "type": "object", - "additionalProperties": { - "type": "number" - } - }, - "user": { - "title": "User", - "type": "string" - }, - "do_sample": { - "title": "Do Sample", - "type": "boolean", - "default": false - }, - "top_k": { - "title": "Top K", - 
"type": "integer", - "default": -1 - }, - "ignore_eos": { - "title": "Ignore Eos", - "type": "boolean", - "default": false - } - } - }, - "ChatCompletionResponse": { - "title": "ChatCompletionResponse", - "required": [ - "model", - "choices", - "usage" - ], - "type": "object", - "properties": { - "id": { - "title": "Id", - "type": "string" - }, - "object": { - "title": "Object", - "type": "string", - "default": "chat.completion" - }, - "created": { - "title": "Created", - "type": "integer" - }, - "model": { - "title": "Model", - "type": "string" - }, - "choices": { - "title": "Choices", - "type": "array", - "items": { - "$ref": "#/components/schemas/ChatCompletionResponseChoice" - } - }, - "usage": { - "$ref": "#/components/schemas/UsageInfo" - } - } - }, - "ChatCompletionResponseChoice": { - "title": "ChatCompletionResponseChoice", - "required": [ - "index", - "message" - ], - "type": "object", - "properties": { - "index": { - "title": "Index", - "type": "integer" - }, - "message": { - "$ref": "#/components/schemas/ChatMessage" - }, - "finish_reason": { - "title": "Finish Reason", - "enum": [ - "stop", - "length", - "function_call" - ], - "type": "string" - } - } - }, - "ChatMessage": { - "title": "ChatMessage", - "required": [ - "role", - "content" - ], - "type": "object", - "properties": { - "role": { - "title": "Role", - "type": "string" - }, - "content": { - "title": "Content", - "type": "string" - } - } - }, - "HTTPValidationError": { - "title": "HTTPValidationError", - "type": "object", - "properties": { - "detail": { - "title": "Detail", - "type": "array", - "items": { - "$ref": "#/components/schemas/ValidationError" - } - } - } - }, - "UsageInfo": { - "title": "UsageInfo", - "type": "object", - "properties": { - "prompt_tokens": { - "title": "Prompt Tokens", - "type": "integer", - "default": 0 - }, - "completion_tokens": { - "title": "Completion Tokens", - "type": "integer", - "default": 0 - }, - "total_tokens": { - "title": "Total Tokens", - "type": "integer", - "default": 0 - } - } - }, - "ValidationError": { - "title": "ValidationError", - "required": [ - "loc", - "msg", - "type" - ], - "type": "object", - "properties": { - "loc": { - "title": "Location", - "type": "array", - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] - } - }, - "msg": { - "title": "Message", - "type": "string" - }, - "type": { - "title": "Error Type", - "type": "string" - } - } - } - } - } -} \ No newline at end of file diff --git a/docs/CN/source/assets/logos/lightllm-logo.png b/docs/CN/source/assets/logos/lightllm-logo.png index 1a9794bf8..5b3b63917 100755 Binary files a/docs/CN/source/assets/logos/lightllm-logo.png and b/docs/CN/source/assets/logos/lightllm-logo.png differ diff --git a/docs/CN/source/lightllm/lightllm_impl.rst b/docs/CN/source/framework/framework.rst old mode 100755 new mode 100644 similarity index 99% rename from docs/CN/source/lightllm/lightllm_impl.rst rename to docs/CN/source/framework/framework.rst index 275828033..a7bbb3fec --- a/docs/CN/source/lightllm/lightllm_impl.rst +++ b/docs/CN/source/framework/framework.rst @@ -1,4 +1,4 @@ -Lightllm 框架 +Lightllm 架构介绍 ========================== lightllm 的设计核心是多进程协作,每个进程负责一个模块,通过zmq和rpc的方式进行多进程协同工作。 diff --git a/docs/CN/source/dev/router.rst b/docs/CN/source/framework/router.rst similarity index 100% rename from docs/CN/source/dev/router.rst rename to docs/CN/source/framework/router.rst diff --git a/docs/CN/source/dev/token_attention.rst b/docs/CN/source/framework/token_attention.rst similarity index 100% 
rename from docs/CN/source/dev/token_attention.rst rename to docs/CN/source/framework/token_attention.rst diff --git a/docs/CN/source/getting_started/benchmark.rst b/docs/CN/source/getting_started/benchmark.rst new file mode 100644 index 000000000..c9fc778aa --- /dev/null +++ b/docs/CN/source/getting_started/benchmark.rst @@ -0,0 +1,218 @@ +Benchmark 测试指南 +================== + +LightLLM 提供了全面的性能测试工具,包括服务端性能测试和静态推理性能测试。本文档将详细介绍如何使用这些工具进行性能评估。 + +服务端性能测试 (Service Benchmark) +--------------------------------- + +服务端性能测试主要用于评估 LightLLM 在真实服务场景下的性能表现,包括吞吐量、延迟等关键指标。 + +QPS 测试 (benchmark_qps.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +QPS (Queries Per Second) 测试是评估服务端性能的核心工具,支持 LightLLM 和 OpenAI 兼容的 API 格式 + +**使用方法:** + +.. code-block:: bash + + python test/benchmark/service/benchmark_qps.py \ + --url http://127.0.0.1:8000/generate_stream \ + --tokenizer_path /path/to/tokenizer \ + --num_clients 100 \ + --input_num 2000 \ + --input_qps 30.0 \ + --input_len 1024 \ + --output_len 128 \ + --server_api lightllm \ + --dump_file results.json + +**主要参数说明:** + +- ``--url``: 服务端地址,支持 LightLLM 和 OpenAI 格式 +- ``--tokenizer_path``: 分词器路径 +- ``--input_num``: 测试请求总数 +- ``--input_qps``: 输入 QPS 限制 +- ``--input_len``: 输入序列长度 +- ``--output_len``: 输出序列长度 +- ``--server_api``: 服务端 API 类型 (lightllm/openai) +- ``--data_path``: 自定义数据集路径 +- ``--continuous_send``: 是否连续发送 (0/1) +- ``--force_terminate``: 强制终止模式 (0/1) + +**输出指标:** + +- Total QPS: 总体每秒查询数 +- Sender QPS: 发送端 QPS +- Avg Input Length: 平均输入长度 +- Avg Output Length: 平均输出长度 +- Total Throughput: 总体吞吐量 (token/s) +- Input Throughput: 输入吞吐量 (token/s) +- Output Throughput: 输出吞吐量 (token/s) +- request_time P{25,50,75,90,95,99,100}: 请求延迟百分位数 +- first_token_time P{25,50,75,90,95,99,100}: 首 token 延迟百分位数 +- decode_token_time P{25,50,75,90,95,99,100}: 解码 token 延迟百分位数 + +固定并发测试 (benchmark_client.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +用于评估不同客户端并发数下的性能表现。 + +**使用方法:** + +.. code-block:: bash + + python test/benchmark/service/benchmark_client.py \ + --url http://127.0.0.1:8000/generate_stream \ + --tokenizer_path /path/to/tokenizer \ + --num_clients 100 \ + --input_num 2000 \ + --input_len 1024 \ + --output_len 128 \ + --server_api lightllm + +ShareGPT 数据集测试 (benchmark_sharegpt.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +使用 ShareGPT 真实对话数据进行性能测试。 + +**使用方法:** + +.. code-block:: bash + + $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + +.. code-block:: bash + + python test/benchmark/service/benchmark_sharegpt.py \ + --dataset /path/to/sharegpt_dataset.json \ + --tokenizer /path/to/tokenizer \ + --num_prompts 1000 \ + --request_rate 10.0 + +**主要参数:** + +- ``--dataset``: ShareGPT 格式数据集路径 +- ``--tokenizer``: 分词器路径 +- ``--num_prompts``: 测试提示数量 +- ``--request_rate``: 请求速率 (requests/s) + + +Prompt Cache 测试 +~~~~~~~~~~~~~~~~~ + +评估不同命中率下,prompt cache 的性能,通过调整 --first_input_len, --output_len --subsequent_input_len 来控制命中率。 +每轮命中率 = (first_input_len + (output_len + subsequent_input_len) * (num_turns - 1)) / (first_input_len + (output_len + subsequent_input_len) * num_turns) +注意要根据最大token容量控制并发数和users数,确保能够放下所有请求,保障其实际命中率和自己预设的命中率一致。 + +.. 
code-block:: bash + + python test/benchmark/service/benchmark_prompt_cache.py \ + --model_url http://127.0.0.1:8000/generate_stream \ + --model_name model \ + --num_workers 10 \ + --first_input_len 512 \ + --subsequent_input_len 512 \ + --output_len 128 \ + --num_turns 10 \ + --num_users 10 + +参数说明: + +- ``--model_url``: 服务地址 +- ``--model_name``: 结果保存文件名 +- ``--num_workers``: 并发数 +- ``--first_input_len``: 第一轮输入长度 +- ``--subsequent_input_len``: 后续轮输入长度 +- ``--output_len``: 输出长度 +- ``--num_turns``: 轮数 +- ``--num_users``: 用户数 + +静态推理性能测试 (Static Inference Benchmark) +-------------------------------------------- + +静态推理测试用于评估模型在固定输入条件下的推理性能, 主要评估算子的优劣 +模型推理测试 (model_infer.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**主要特性:** + +- 支持 prefill 和 decode 阶段性能测试 +- 支持 microbatch overlap 优化 +- 支持多 GPU 并行推理 +- 提供详细的吞吐量统计 + +**使用方法:** + +.. code-block:: bash + + python test/benchmark/static_inference/test_model.py \ + --model_dir /path/to/model \ + --batch_size 32 \ + --input_len 1024 \ + --output_len 128 \ + --tp 2 \ + --data_type bf16 + +**主要参数:** + +- ``--model_dir``: 模型路径 +- ``--batch_size``: 批次大小 +- ``--input_len``: 输入序列长度 +- ``--output_len``: 输出序列长度 +- ``--tp``: Tensor Parallel 并行度 +- ``--data_type``: 数据类型 (bf16/fp16/fp32) +- ``--enable_prefill_microbatch_overlap``: 启用 prefill microbatch overlap,仅适用于DeepSeek模型的EP模式 +- ``--enable_decode_microbatch_overlap``: 启用 decode microbatch overlap,仅适用于DeepSeek模型的EP模式 +- ``--torch_profile``: 启用 torch profiler 进行性能分析 + +.. note:: + 这里没有列举完整的启动参数,静态测试脚本也共享lightllm的启动参数,更多启动配置可以参考 :ref:`tutorial/api_server_args_zh` 。 + +**输出指标:** + +- Prefill 阶段吞吐量 (tokens/s) +- Decode 阶段吞吐量 (tokens/s) +- 各阶段延迟统计 + +多结果预测性能测试 (model_infer_mtp.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +多结果预测静态性能测试,默认百分百接受率,用来评估多结果预测的极限性能。目前只支持DeepSeek 系列模型 + +**使用方法:** + +.. code-block:: bash + + python test/benchmark/static_inference/test_model.py \ + --model_dir /path/to/main_model \ + --mtp_mode deepseekv3 \ + --mtp_step 1 \ + --mtp_draft_model_dir /path/to/draft_model \ + --batch_size 32 \ + --input_len 1024 \ + --output_len 128 + +参数说明: + +- ``--model_dir``: 主模型路径 +- ``--mtp_mode``: 指定多结果预测的模型,目前只支持deepseekv2/v3/r1 +- ``--mtp_step``: 每次forward step产生的token 数量,默认为1 +- ``--mtp_draft_model_dir``: 草稿模型路径 + +Vision Transformer 测试 (test_vit.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +用于测试 Vision Transformer 模型的性能。 + +**使用方法:** + +.. code-block:: bash + + python test/benchmark/static_inference/test_vit.py \ + --model_dir ./InternVL2/InternVL2-8B/ \ + --batch_size 1 \ + --image_size 448 \ + --world_size 2 \ No newline at end of file diff --git a/docs/CN/source/getting_started/installation.rst b/docs/CN/source/getting_started/installation.rst index d70707a37..cbf5dc339 100755 --- a/docs/CN/source/getting_started/installation.rst +++ b/docs/CN/source/getting_started/installation.rst @@ -48,7 +48,8 @@ Lightllm 是一个纯python开发的推理框架,其中的算子使用triton $ python tools/quick_launch_docker.py --help .. note:: - 如果你使用多卡,你也许需要提高上面的 –shm_size 的参数设置。 + 如果你使用多卡,你也许需要提高上面的 –shm_size 的参数设置。如果需要跑DeepSeek模型的EP模式,请使用镜像 + ghcr.io/modeltc/lightllm:main-deepep。 .. _build_from_source: diff --git a/docs/CN/source/getting_started/quickstart.rst b/docs/CN/source/getting_started/quickstart.rst index b22cf619d..adad90865 100755 --- a/docs/CN/source/getting_started/quickstart.rst +++ b/docs/CN/source/getting_started/quickstart.rst @@ -17,7 +17,7 @@ 1. 
准备模型文件 ------------------------- -下面的内容将会以 `Llama-2-7b-chat `_ 演示lightllm对大语言模型的支持。 +下面的内容将会以 `Qwen3-8B `_ 演示lightllm对大语言模型的支持。 下载模型的方法可以参考文章:`如何快速下载huggingface模型——全方法总结 `_ 下面是下载模型的实例代码: @@ -38,230 +38,32 @@ .. code-block:: console - $ huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir Llama-2-7b-chat - -.. tip:: - 上面的下载模型的代码需要科学上网,并且需要花费一定的时间,你可以使用其它下载方式或者其它支持的模型作为替代。最新的支持的模型的列表请查看 `项目主页 `_ 。 + $ huggingface-cli download Qwen/Qwen3-8B --local-dir Qwen3-8B 2. 启动模型服务 ------------------------- -下载完Llama-2-7b-chat模型以后,在终端使用下面的代码部署API服务: +下载完Qwen3-8B模型以后,在终端使用下面的代码部署API服务: .. code-block:: console - $ python -m lightllm.server.api_server --model_dir ~/models/Llama-2-7b-chat + $ python -m lightllm.server.api_server --model_dir ~/models/Qwen3-8B .. note:: 上面代码中的 ``--model_dir`` 参数需要修改为你本机实际的模型路径。 -单机H200部署 DeepSeek-R1 模型, 启动命令如下: - -.. code-block:: console - - $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 8 --graph_max_batch_size 100 - -.. note:: - LOADWORKER 指定了模型加载的线程,可以提高模型加载的速度。--graph_max_batch_size 指定了要捕获的cudagraph的数量,将捕获从1到100的batch size的图。 - -双机H100部署 DeepSeek-R1 模型,启动命令如下: - -.. code-block:: console - - $ # Node 0 - $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 0 - $ # Node 1 - $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 1 - -3. PD 分离启动模型服务 -------------------------- -查找本机IP - -.. code-block:: console - - $ hostname -i - -运行MPS(可选, 有mps支持性能会好特别多,但是部分显卡和驱动环境开启mps会容易出现错误,建议升级驱动到较高版本,特别是H系列卡) - -.. code-block:: console - - $ nvidia-cuda-mps-control -d - - -运行pd_master服务 - -.. code-block:: console - - $ python -m lightllm.server.api_server \ - $ --model_dir /your/model/path \ - $ --run_mode "pd_master" \ - $ --host /your/host/ip \ - $ --port 60011 - -新建终端,运行prefill服务 - -.. code-block:: console - - $ CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \ - $ --run_mode "prefill" \ - $ --host /your/host/ip \ - $ --port 8017 \ - $ --tp 2 \ - $ --nccl_port 2732 \ - $ --max_total_token_num 400000 \ - $ --tokenizer_mode fast \ - $ --pd_master_ip /your/host/ip \ - $ --pd_master_port 60011 \ - $ --max_req_total_len 16000 \ - $ --running_max_req_size 128 \ - $ --disable_cudagraph - -新建终端,运行decoding服务 - -.. code-block:: console - - $ CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \ - $ --run_mode "decode" \ - $ --host /your/host/ip \ - $ --port 8118 \ - $ --nccl_port 12322 \ - $ --tp 2 \ - $ --max_total_token_num 400000 \ - $ --graph_max_len_in_batch 2048 \ - $ --graph_max_batch_size 16 \ - $ --tokenizer_mode fast \ - $ --pd_master_ip /your/host/ip \ - $ --pd_master_port 60011 - -.. note:: - prefill和decoding阶段的tp大小保持一致, 目前可以支持 prefill 和 decode 节点的数量是变化的,同时prefill 和 decode可以跨机部署。 - - -4. (可选)测试模型服务 -------------------------- - -在新的终端,使用下面的指令对模型服务进行测试: - -.. code-block:: console - - $ curl http://server_ip:server_port/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is AI?", - $ "parameters":{ - $ "max_new_tokens":17, - $ "frequency_penalty":1 - $ } - $ }' - - -对于DeepSeek-R1模型,可以用如下脚本进行测试: - -.. 
code-block:: console - - $ cd test - $ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.01:8000/generate_stream - - -3. PD 分离多PD_Master节点类型启动模型服务 -------------------------- -查找本机IP - -.. code-block:: console - - $ hostname -i - -运行MPS(可选, 有mps支持性能会好特别多,但是部分显卡和驱动环境开启mps会容易出现错误,建议升级驱动到较高版本,特别是H系列卡) - -.. code-block:: console - - $ nvidia-cuda-mps-control -d - - -运行config_server服务 -.. code-block:: console - -$ python -m lightllm.server.api_server \ -$ --run_mode "config_server" \ -$ --config_server_host /your/host/ip \ -$ --config_server_port 60088 \ - - -运行pd_master服务, 在多pd_master节点模式下,可以开启多个pd_master服务,来实现负载均衡,单个pd_master因为python gil锁的原因 -其并发性能存在上限。 - -.. code-block:: console - - $ python -m lightllm.server.api_server \ - $ --model_dir /your/model/path \ - $ --run_mode "pd_master" \ - $ --host /your/host/ip \ - $ --port 60011 \ - $ --config_server_host \ - $ --config_server_port - -新建终端,运行prefill服务 - -.. code-block:: console - - $ CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \ - $ --run_mode "prefill" \ - $ --host /your/host/ip \ - $ --port 8017 \ - $ --tp 2 \ - $ --nccl_port 2732 \ - $ --max_total_token_num 400000 \ - $ --tokenizer_mode fast \ - $ --max_req_total_len 16000 \ - $ --running_max_req_size 128 \ - $ --disable_cudagraph \ - $ --config_server_host \ - $ --config_server_port - -新建终端,运行decoding服务 - -.. code-block:: console - - $ CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \ - $ --run_mode "decode" \ - $ --host /your/host/ip \ - $ --port 8118 \ - $ --nccl_port 12322 \ - $ --tp 2 \ - $ --max_total_token_num 400000 \ - $ --graph_max_len_in_batch 2048 \ - $ --graph_max_batch_size 16 \ - $ --tokenizer_mode fast \ - $ --config_server_host \ - $ --config_server_port - -.. note:: - prefill和decoding阶段的tp大小保持一致, 目前可以支持 prefill 和 decode 节点的数量是变化的,同时prefill 和 decode可以跨机部署。 - - -4. (可选)测试模型服务 +3. 测试模型服务 ------------------------- -在新的终端,使用下面的指令对模型服务进行测试, 在多pd_master模式下,每个pd_master都可以作为访问入口: - -.. code-block:: console - - $ curl http://server_ip:server_port/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is AI?", - $ "parameters":{ - $ "max_new_tokens":17, - $ "frequency_penalty":1 - $ } - $ }' - - -对于DeepSeek-R1模型,可以用如下脚本进行测试: - .. 
code-block:: console - $ cd test - $ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.01:8000/generate_stream - + $ curl http://127.0.0.1:8000/generate \ + -H "Content-Type: application/json" \ + -d '{ + "inputs": "What is AI?", + "parameters":{ + "max_new_tokens":17, + "frequency_penalty":1 + } + }' \ No newline at end of file diff --git a/docs/CN/source/index.rst b/docs/CN/source/index.rst index d4b548b6d..b2fa01573 100755 --- a/docs/CN/source/index.rst +++ b/docs/CN/source/index.rst @@ -23,46 +23,15 @@ Lightllm 是一个纯python开发的大语言模型推理和服务框架,具有轻量级设计、易扩展以及高性能等特点。 -Lightllm 整合了众多的开源方案的优点,包括但不限于 FasterTransformer、TGI、vLLM 和 FlashAttention。 +Lightllm 整合了众多的开源方案的优点,包括但不限于 FasterTransformer、TGI、vLLM、SGLang 和 FlashAttention。 **重要特性**: -* 多进程协同:分词、语言模型推理、视觉模型推理、分词等工作异步进行,大幅提高GPU利用率。 -* 零填充:提供对跨多个模型的 nopad-Attention 计算的支持,以有效处理长度差异较大的请求。 -* 动态批处理:能够对请求进行动态的批处理调度。 -* FlashAttention:结合 FlashAttention 来提高推理过程中的速度并减少 GPU 内存占用。 -* 向量并行:利用多个 GPU 进行张量并行性从而加快推理速度。 -* **Token Attention**:实现了以token为单位的KV缓存内存管理机制,实现推理过程中内存零浪费。 -* 高性能路由:结合Token Attention,对GPU内存以token为单位进行精致管理,优化系统吞吐量。 -* int8 KV Cache:该功能可以将最大token量提升解决两倍。现在只支持llama架构的模型。 - -**支持的模型列表**: - -- `BLOOM `_ -- `LLaMA `_ -- `LLaMA V2 `_ -- `StarCoder `_ -- `Qwen-7b `_ -- `ChatGLM2-6b `_ -- `Baichuan-7b `_ -- `Baichuan2-7b `_ -- `Baichuan2-13b `_ -- `Baichuan-13b `_ -- `InternLM-7b `_ -- `Yi-34b `_ -- `Qwen-VL `_ -- `Qwen-VL-Chat `_ -- `Llava-7b `_ -- `Llava-13b `_ -- `Mixtral `_ -- `Stablelm `_ -- `MiniCPM `_ -- `Phi-3 `_ -- `CohereForAI `_ -- `DeepSeek-V2-Lite `_ -- `DeepSeek-V2 `_ - +* 多进程协同:输入文本编码、语言模型推理、视觉模型推理、输出解码等工作异步进行,大幅提高GPU利用率。 +* 跨进程请求对象共享:通过共享内存,实现跨进程请求对象共享,降低进程间通信延迟。 +* 高效的调度策略:带预测的峰值显存调度策略,最大化GPU显存利用率的同时,降低请求逐出。 +* 高性能的推理后端:高效的算子实现,多种并行方式支持(张量并行,数据并行以及专家并行),动态kv缓存,丰富的量化支持(int8, fp8, int4),结构化输出以及多结果预测。 文档列表 ------------- @@ -73,45 +42,33 @@ Lightllm 整合了众多的开源方案的优点,包括但不限于 FasterTran 安装指南 快速开始 - + 性能评测 .. toctree:: :maxdepth: 1 - :caption: Lightllm - - lightllm/lightllm_intro - lightllm/lightllm_impl + :caption: 部署教程 + + DeepSeek R1 部署 + 多模态部署 + 奖励模型部署 + OpenAI 接口使用 + APIServer 参数详解 + lightllm api介绍 .. toctree:: :maxdepth: 1 - :caption: 模型 + :caption: 模型支持 支持的模型列表 - 启动和测试模型示例 添加新模型 - -.. toctree:: - :maxdepth: 1 - :caption: 启动服务 - - 启动参数说明 - 服务性能评测 - - -.. toctree:: - :maxdepth: 1 - :caption: 使用服务 - - user/api_param - user/openapi_docs - .. toctree:: :maxdepth: 1 - :caption: 开发者文档 + :caption: 架构介绍 - dev/token_attention - dev/router + 架构介绍 + token attention介绍 + 峰值显存调度器介绍 .. Indices and tables .. ================== diff --git a/docs/CN/source/lightllm/lightllm_intro.rst b/docs/CN/source/lightllm/lightllm_intro.rst deleted file mode 100644 index 315c82cbd..000000000 --- a/docs/CN/source/lightllm/lightllm_intro.rst +++ /dev/null @@ -1,97 +0,0 @@ -.. _lightllm: - -LightLLM 介绍 -================ - -随着ChatGPT的流行,大语言模型(简称LLM)受到越来越多的关注。此类模式的出现,极大地提高了人们的工作效率。 -然而,进一步广泛使用LLM的关键在于如何以低成本和高吞吐量地部署数十亿参数的模型。 -为了提高大模型服务的吞吐量,让更多感兴趣的研究人员快速参与进来, -一种名为 LightLLM 的轻量级 LLM 推理服务框架应运而生。 -LightLLM 引入了一种更细粒度的kvCache管理算法,称为TokenAttention, -并设计了一个与TokenAttention高效配合的Efficient Router调度算法。 -通过 TokenAttention 和 Efficient Router 的配合, -LightLLM 在大多数场景下实现了比 vLLM 和 Text Generation Inference 更高的吞吐量, -甚至在某些情况下性能提升了 4 倍左右。 LightLLM 灵活、用户友好且高效, -欢迎有兴趣的朋友进入 `项目主页 `_ 了解更多。 - - -.. 
_challenge: - -LLM 推理服务的挑战 ------------------- - -大型语言模型由于其优异的性能而引起了研究人员的极大关注。 -这些模型不仅可以与人类进行日常对话,还可以帮助完成各种日常任务,从而提高生产力。 -然而,尽管这些模型表现出了出色的性能,但提高部署大规模模型的性能仍面临以下挑战: - -* **内存碎片严重**:从几十到几百G不等的网络权重,以及推理过程中不断动态增长的KV Cache,容易导致大量的内存碎片,进而导致内存利用率低。 -* **请求调度效率低**:请求的长度随时间动态变化,可能导致GPU空闲或利用率低的问题。 -* **内核定制难度大**:为了高效利用内存、提高服务吞吐量,需要针对网络定制内核。然而,这需要研究人员付出大量的努力。 - - -.. _solutions_and_problems: - -现有的解决方案和存在问题 ------------------------------ - -为了应对上述挑战,许多优秀的LLM推理框架应运而生, -例如FasterTransformer、Text-Generation-Inference(简称TGI)、vLLM等。这些框架的核心特性和能力如下表所示: - - -.. list-table:: 各个框架对比 - :header-rows: 1 - - * - 框架 - - NV Triton + FasterTransformer - - TGI - - vLLM - - LightLLM - * - 核心特征 - - 高效算子 - - `Continuous batch `_, Token streaming - - `PageAttention `_ - - 三进程异步协同, `Token Attention `_, Efficient Router - * - 内存碎片 - - 少 - - 多 - - 少 - - 少 - * - 请求的调度效率 - - 低 - - 中 - - 中 - - 高 - * - 定制化算子的难度 - - 高 - - 中 - - 中 - - 低 - -这些框架都有自己独特的特点。 -例如,FasterTransformer具有优异的静态推理性能,但缺乏健壮的服务调度,并且主要采用C++开发,导致二次开发成本较高。 -TGI具有优秀的服务接口和Continuation Batch等调度特性,但其推理性能、调度策略、内存管理等方面存在一些不足。 -vLLM具有出色的内存管理能力,但在请求调度方面缺乏效率,其整体实现细节更适合部署小型模型。 - - -Lightllm ----------------------- - -因此,为了解决这些问题,我们开发了一个名为LightLLM的LLM部署框架,它是基于纯Python语言的。 -它使研究人员能够在本地轻松部署和定制轻量级模型,从而可以快速扩展不同模型并集成各种优秀的开源功能。 -LightLLM的核心特点如下: - -* 三进程异步协作:分词、模型推理、去分词异步进行,GPU利用率大幅提升。 -* :ref:`TokenAttention`:实现token-wise的KV缓存内存管理机制,实现推理时内存零浪费。 -* :ref:`Efficient_Router`:与Token Attention合作,精心管理每个Token的GPU内存,从而优化系统吞吐量。 - -凭借基于OpenAI Triton开发的高度协调的高效内核和服务调度,LightLLM实现了优异的吞吐性能。 - -.. figure:: ../assets/lightllm/arch.png - :width: 100% - :align: center - :alt: Lightllm - :class: no-scaled-link - - - -LightLLM致力于让更多人参与进来,从而灵活高效地探索各种LLM部署和推理解决方案。也为硬件厂商推动该领域的发展提供参考。我们希望大家能够给它更多的star,fork这个项目,并做出贡献。我们相信未来将会出现更多的技术和解决方案(如TensorRT),不断降低部署成本,让AGI更容易走进普通家庭。 \ No newline at end of file diff --git a/docs/CN/source/models/supported_models.rst b/docs/CN/source/models/supported_models.rst index 5971149f8..8f567899d 100755 --- a/docs/CN/source/models/supported_models.rst +++ b/docs/CN/source/models/supported_models.rst @@ -92,4 +92,5 @@ Reward模型 - 备注 * - `internLM-reward `_ - :code:`--use_reward_model` - + * - `Qwen2-Reward `_ + - :code:`--use_reward_model` \ No newline at end of file diff --git a/docs/CN/source/models/test.rst b/docs/CN/source/models/test.rst deleted file mode 100755 index 572b01f2f..000000000 --- a/docs/CN/source/models/test.rst +++ /dev/null @@ -1,196 +0,0 @@ -启动和测试模型示例 -==================== - -Qwen2-0.5B -^^^^^^^^^^^^^^^^^^^^^ - -**启动服务** - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir ~/models/Qwen2-0.5B --trust_remote_code - -**测试服务** - - -.. code-block:: console - - $ curl http://localhost:8000/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is AI?", - $ "parameters":{ - $ "max_new_tokens":17, - $ "frequency_penalty":1 - $ } - $ }' - - -Qwen-VL-Chat -^^^^^^^^^^^^^^^^^ - -**启动服务** - -.. code-block:: console - - $ python -m lightllm.server.api_server - $ --model_dir ~/models/Qwen-VL-Chat \ - $ --trust_remote_code \ - $ --enable_multimodal - -**测试服务** - -.. 
code-block:: python - - import json - import requests - import base64 - - def run(query, uris): - images = [] - for uri in uris: - if uri.startswith("http"): - images.append({"type": "url", "data": uri}) - else: - with open(uri, 'rb') as fin: - b64 = base64.b64encode(fin.read()).decode("utf-8") - images.append({'type': "base64", "data": b64}) - - data = { - "inputs": query, - "parameters": { - "max_new_tokens": 200, - # The space before <|endoftext|> is important, - # the server will remove the first bos_token_id, - # but QWen tokenizer does not has bos_token_id - "stop_sequences": [" <|endoftext|>", " <|im_start|>", " <|im_end|>"], - }, - "multimodal_params": { - "images": images, - } - } - - url = "http://127.0.0.1:8000/generate" - headers = {'Content-Type': 'application/json'} - response = requests.post(url, headers=headers, data=json.dumps(data)) - return response - - query = """ - <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - - 这是什么?<|im_end|> - <|im_start|>assistant - """ - - response = run( - uris = [ - "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - ], - query = query - ) - - if response.status_code == 200: - print(f"Result: {response.json()}") - else: - print(f"Error: {response.status_code}, {response.text}") - - - -llama2-70b-chat -^^^^^^^^^^^^^^^^^^^^^^^ - -**启动服务** - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir ~/models/llama2-70b-chat --tp 4 - -.. tip:: - - :code:`--tp` 为4,表示使用四张卡进行张量并行。 - -**测试服务** - -.. code-block:: console - - $ curl http://localhost:8000/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is LLM?", - $ "parameters":{ - $ "max_new_tokens":170, - $ "frequency_penalty":1 - $ } - $ }' - - -internlm2-1_8b -^^^^^^^^^^^^^^^^^^^^^^^ - -**启动服务** - -.. code-block:: console - - $ python -m lightllm.server.api_server - $ --model_dir ~/models/internlm2-1_8b \ - $ --trust_remote_code - - -**测试服务** - -.. code-block:: console - - $ curl http://localhost:8000/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is LLM?", - $ "parameters":{ - $ "max_new_tokens":170, - $ "frequency_penalty":1 - $ } - $ }' - - -internlm2-1_8b-reward -^^^^^^^^^^^^^^^^^^^^^^^ - -**启动服务** - -.. code-block:: console - - $ python -m lightllm.server.api_server - $ --model_dir ~/models/internlm2-1_8b-reward \ - $ --use_reward_model \ - $ --trust_remote_code - -.. tip:: - - ``--use_reward_model`` 表示使用 reward 类模型必须要打开的选项。 - - -**测试服务** - -.. code-block:: python - - import json - import requests - - query = "<|im_start|>user\nHello! What's your name?<|im_end|>\n<|im_start|>assistant\nMy name is InternLM2! A helpful AI assistant. What can I do for you?<|im_end|>\n<|reward|>" - - url = "http://127.0.0.1:8000/get_score" - headers = {'Content-Type': 'application/json'} - - data = { - "chat": query, - "parameters": { - "frequency_penalty":1 - } - } - response = requests.post(url, headers=headers, data=json.dumps(data)) - - if response.status_code == 200: - print(f"Result: {response.json()}") - else: - print(f"Error: {response.status_code}, {response.text}") \ No newline at end of file diff --git a/docs/CN/source/server/api_server_args_zh.rst b/docs/CN/source/server/api_server_args_zh.rst deleted file mode 100755 index 41a9910ad..000000000 --- a/docs/CN/source/server/api_server_args_zh.rst +++ /dev/null @@ -1,12 +0,0 @@ -APIServer 参数详解 -============================= - - -使用方法 -++++++++++++ - -.. 
argparse:: - :module: lightllm.server.api_cli - :func: make_argument_parser - :prog: python -m lightllm.server.api_server - :nodefaultconst: diff --git a/docs/CN/source/server/benchmark.rst b/docs/CN/source/server/benchmark.rst deleted file mode 100755 index b7ef97cb3..000000000 --- a/docs/CN/source/server/benchmark.rst +++ /dev/null @@ -1,43 +0,0 @@ -服务性能评测 -================== - -部署完模型以后,对服务性能进行评测是非常重要的,通过服务性能的表现调整配置从而更好地利用显卡资源。 -本文中,我们使用 LLaMA-7B 模型,在80G的A800显卡上,比较了lightllm 和 vLLM==0.1.2 的性能。 -具体比较方式参考以下步骤: - -1. 下载数据集 -^^^^^^^^^^^^^^ - -.. code-block:: console - - $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - -2. 开启模型服务 -^^^^^^^^^^^^^^^^^^^ - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto - - -3. 性能评测 -^^^^^^^^^^^^^^^^ - -.. code-block:: console - - $ cd test - $ python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 - - -输出: - -.. code-block:: console - - read data set finish - total tokens: 494250 - Total time: 111.37 s - Throughput: 8.98 requests/s - Average latency: 43.52 s - Average latency per token: 0.15 s - Average latency per output token: 0.73 s \ No newline at end of file diff --git a/docs/CN/source/user/api_param.rst b/docs/CN/source/tutorial/api_param.rst similarity index 93% rename from docs/CN/source/user/api_param.rst rename to docs/CN/source/tutorial/api_param.rst index 1bc33d9ad..f3785dd61 100755 --- a/docs/CN/source/user/api_param.rst +++ b/docs/CN/source/tutorial/api_param.rst @@ -138,10 +138,4 @@ reward 类模型,获取对话分数 :: - Result: {'score': 0.4892578125, 'prompt_tokens': 39, 'finish_reason': 'stop'} - - -:code:`POST /v1/chat/completions` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -openai 格式接口, 请查看 `openai 接口文档 `_ 查看更多信息。 + Result: {'score': 0.4892578125, 'prompt_tokens': 39, 'finish_reason': 'stop'} \ No newline at end of file diff --git a/docs/CN/source/tutorial/api_server_args_zh.rst b/docs/CN/source/tutorial/api_server_args_zh.rst new file mode 100755 index 000000000..d7c055ef4 --- /dev/null +++ b/docs/CN/source/tutorial/api_server_args_zh.rst @@ -0,0 +1,511 @@ +APIServer 参数详解 +================ + +本文档详细介绍了 LightLLM APIServer 的所有启动参数及其用法。 + +基础配置参数 +----------- + +.. option:: --run_mode + + 设置运行模式,可选值: + + * ``normal``: 单服务器模式(默认) + * ``prefill``: 预填充模式(用于 pd 分离运行模式) + * ``decode``: 解码模式(用于 pd 分离运行模式) + * ``pd_master``: pd 主节点模式(用于 pd 分离运行模式) + * ``config_server``: 配置服务器模式(用于 pd 分离模式,用于注册 pd_master 节点并获取 pd_master 节点列表),专门为大规模、高并发场景设计,当 `pd_master` 遇到显著的 CPU 瓶颈时使用。 + +.. option:: --host + + 服务器监听地址,默认为 ``127.0.0.1`` + +.. option:: --port + + 服务器监听端口,默认为 ``8000`` + +.. option:: --httpserver_workers + + HTTP 服务器工作进程数,默认为 ``1`` + +.. option:: --zmq_mode + + ZMQ 通信模式,可选值: + + * ``tcp://``: TCP 模式 + * ``ipc:///tmp/``: IPC 模式(默认) + + 只能在 ``['tcp://', 'ipc:///tmp/']`` 中选择 + +PD 分离模式参数 +-------------- + +.. option:: --pd_master_ip + + PD 主节点 IP 地址,默认为 ``0.0.0.0`` + + 当 run_mode 设置为 prefill 或 decode 时需要设置此参数 + +.. option:: --pd_master_port + + PD 主节点端口,默认为 ``1212`` + + 当 run_mode 设置为 prefill 或 decode 时需要设置此参数 + +.. option:: --pd_decode_rpyc_port + + PD 模式下解码节点用于 kv move manager rpyc 服务器的端口,默认为 ``42000`` + +.. option:: --config_server_host + + 配置服务器模式下的主机地址 + +.. option:: --config_server_port + + 配置服务器模式下的端口号 + +模型配置参数 +----------- + +.. 
option:: --model_name + + 模型名称,用于区分内部模型名称,默认为 ``default_model_name`` + + 可通过 ``host:port/get_model_name`` 获取 + +.. option:: --model_dir + + 模型权重目录路径,应用将从该目录加载配置、权重和分词器 + +.. option:: --tokenizer_mode + + 分词器加载模式,可选值: + + * ``slow``: 慢速模式,加载快但运行慢,适合调试和测试 + * ``fast``: 快速模式(默认),获得最佳性能 + * ``auto``: 自动模式,尝试使用快速模式,失败则使用慢速模式 + +.. option:: --load_way + + 模型权重加载方式,默认为 ``HF``(Huggingface 格式) + + llama 模型还支持 ``DS``(Deepspeed)格式 + +.. option:: --trust_remote_code + + 是否允许在 Hub 上使用自定义模型定义的文件 + +内存和批处理参数 +-------------- + +.. option:: --max_total_token_num + + GPU 和模型支持的总 token 数量,等于 max_batch * (input_len + output_len) + + 如果不指定,将根据 mem_fraction 自动计算 + +.. option:: --mem_fraction + + 内存使用比例,默认为 ``0.9`` + + 如果运行时出现 OOM,可以指定更小的值 + +.. option:: --batch_max_tokens + + 新批次的最大 token 数量,控制预填充批次大小以防止 OOM + +.. option:: --running_max_req_size + + 同时进行前向推理的最大请求数量,默认为 ``1000`` + +.. option:: --max_req_total_len + + 请求输入长度 + 请求输出长度的最大值,默认为 ``16384`` + +.. option:: --eos_id + + 结束停止 token ID,可以指定多个值。如果为 None,将从 config.json 加载 + +.. option:: --tool_call_parser + + openai接口工具调用解析器类型,可选值: + + * ``qwen25`` + * ``llama3`` + * ``mistral`` + +不同并行模式设置参数 +------------------ + +.. option:: --nnodes + + 节点数量,默认为 ``1`` + +.. option:: --node_rank + + 当前节点的排名,默认为 ``0`` + +.. option:: --multinode_httpmanager_port + + 多节点 HTTP 管理器端口,默认为 ``12345`` + +.. option:: --multinode_router_gloo_port + + 多节点路由器 gloo 端口,默认为 ``20001`` + +.. option:: --tp + + 模型张量并行大小,默认为 ``1`` + +.. option:: --dp + + 数据并行大小,默认为 ``1`` + + 这是 deepseekv2 的有用参数。使用 deepseekv2 模型时,将 dp 设置为等于 tp 参数。 + 其他情况下请不要设置,保持默认值 1。 + +.. option:: --nccl_host + + 用于构建 PyTorch 分布式环境的 nccl_host,默认为 ``127.0.0.1`` + + 多节点部署时,应设置为主节点的 IP + +.. option:: --nccl_port + + 用于构建 PyTorch 分布式环境的 nccl_port,默认为 ``28765`` + +.. option:: --use_config_server_to_init_nccl + + 使用由 config_server 启动的 tcp store 服务器初始化 nccl,默认为 False + + 设置为 True 时,--nccl_host 必须等于 config_server_host,--nccl_port 对于 config_server 必须是唯一的, + 不要为不同的推理节点使用相同的 nccl_port,这将是严重错误 + +attention类型选择参数 +-------------------- + +.. option:: --mode + + 模型推理模式,可以指定多个值: + + * ``triton_int8kv``: 使用 int8 存储 kv cache,可增加 token 容量,使用 triton kernel + * ``ppl_int8kv``: 使用 int8 存储 kv cache,使用 ppl 快速 kernel + * ``ppl_fp16``: 使用 ppl 快速 fp16 解码注意力 kernel + * ``triton_flashdecoding``: 用于长上下文的 flashdecoding 模式,当前支持 llama llama2 qwen + * ``triton_gqa_attention``: 使用 GQA 的模型的快速 kernel + * ``triton_gqa_flashdecoding``: 使用 GQA 的模型的快速 flashdecoding kernel + * ``triton_fp8kv``: 使用 float8 存储 kv cache,目前仅用于 deepseek2 + + 需要阅读源代码以确认所有模型支持的具体模式 + +调度参数 +------------ + +.. option:: --router_token_ratio + + 判断服务是否繁忙的阈值,默认为 ``0.0``,一旦kv cache 使用率超过此值,则会直接变为保守调度。 + +.. option:: --router_max_new_token_len + + 调度器评估请求kv占用时,使用的请求输出长度,默认为 ``1024``,一般低于用户设置的max_new_tokens。该参数只在 --router_token_ratio 大于0时生效。 + 设置改参数,会使请求调度更为激进,系统同时处理的请求数会更多,同时也会不可避免的造成请求的暂停重计算。 + +.. option:: --router_max_wait_tokens + + 每 router_max_wait_tokens 解码步骤后触发一次调度新请求,默认为 ``6`` + +.. option:: --disable_aggressive_schedule + + 禁用激进调度 + + 激进调度可能导致解码期间频繁的预填充中断。禁用它可以让 router_max_wait_tokens 参数更有效地工作。 + +.. option:: --disable_dynamic_prompt_cache + + 禁用kv cache 缓存 + +.. option:: --chunked_prefill_size + + 分块预填充大小,默认为 ``4096`` + +.. option:: --disable_chunked_prefill + + 是否禁用分块预填充 + +.. option:: --diverse_mode + + 多结果输出模式 + + +输出约束参数 +----------- + +.. option:: --token_healing_mode + +.. option:: --output_constraint_mode + + 设置输出约束后端,可选值: + + * ``outlines``: 使用 outlines 后端 + * ``xgrammar``: 使用 xgrammar 后端 + * ``none``: 无输出约束(默认) + +.. 
option:: --first_token_constraint_mode + + 约束第一个 token 的允许范围 + 使用环境变量 FIRST_ALLOWED_TOKENS 设置范围,例如 FIRST_ALLOWED_TOKENS=1,2 + +多模态参数 +-------- + +.. option:: --enable_multimodal + + 是否允许加载额外的视觉模型 + +.. option:: --enable_multimodal_audio + + 是否允许加载额外的音频模型(需要 --enable_multimodal) + +.. option:: --enable_mps + + 是否为多模态服务启用 nvidia mps + +.. option:: --cache_capacity + + 多模态资源的缓存服务器容量,默认为 ``200`` + +.. option:: --cache_reserved_ratio + + 缓存服务器清理后的保留容量比例,默认为 ``0.5`` + +.. option:: --visual_infer_batch_size + + 每次推理批次中处理的图像数量,默认为 ``1`` + +.. option:: --visual_gpu_ids + + 要使用的 GPU ID 列表,例如 0 1 2 + +.. option:: --visual_tp + + ViT 的张量并行实例数量,默认为 ``1`` + +.. option:: --visual_dp + + ViT 的数据并行实例数量,默认为 ``1`` + +.. option:: --visual_nccl_ports + + 为 ViT 构建分布式环境的 NCCL 端口列表,例如 29500 29501 29502,默认为 [29500] + +性能优化参数 +----------- + +.. option:: --disable_custom_allreduce + + 是否禁用自定义 allreduce + +.. option:: --enable_custom_allgather + + 是否启用自定义 allgather + +.. option:: --enable_tpsp_mix_mode + + 推理后端将使用 TP SP 混合运行模式 + + 目前仅支持 llama 和 deepseek系列 模型 + +.. option:: --enable_prefill_microbatch_overlap + + 推理后端将为预填充使用微批次重叠模式 + + 目前仅支持 deepseek系列 模型 + +.. option:: --enable_decode_microbatch_overlap + + 推理后端将为解码使用微批次重叠模式 + +.. option:: --enable_flashinfer_prefill + + 推理后端将为预填充使用 flashinfer 的注意力 kernel + +.. option:: --enable_flashinfer_decode + + 推理后端将为解码使用 flashinfer 的注意力 kernel + +.. option:: --enable_fa3 + + 推理后端将为预填充和解码使用 fa3 注意力 kernel + +.. option:: --disable_cudagraph + + 禁用解码阶段的 cudagraph + +.. option:: --graph_max_batch_size + + 解码阶段可以被 cuda graph 捕获的最大批次大小,默认为 ``256`` + +.. option:: --graph_split_batch_size + + 控制解码期间生成 CUDA graph 的间隔,默认为 ``32`` + + 对于从 1 到指定 graph_split_batch_size 的值,将连续生成 CUDA graph。 + 对于从 graph_split_batch_size 到 graph_max_batch_size 的值, + 每增加 graph_grow_step_size 就会生成一个新的 CUDA graph。 + 正确配置此参数可以帮助优化 CUDA graph 执行的性能。 + +.. option:: --graph_grow_step_size + + 对于从 graph_split_batch_size 到 graph_max_batch_size 的 batch_size 值, + 每增加 graph_grow_step_size 就会生成一个新的 CUDA graph,默认为 ``16`` + +.. option:: --graph_max_len_in_batch + + 解码阶段可以被 cuda graph 捕获的最大序列长度,默认为 ``0`` + + 默认值为 8192。如果遇到更大的值,将转为 eager 模式。 + +量化参数 +------- + +.. option:: --quant_type + + 量化方法,可选值: + + * ``ppl-w4a16-128`` + * ``flashllm-w6a16`` + * ``ao-int4wo-[32,64,128,256]`` + * ``ao-int8wo`` + * ``ao-fp8w8a16`` + * ``ao-fp6w6a16`` + * ``vllm-w8a8`` + * ``vllm-fp8w8a8`` + * ``vllm-fp8w8a8-b128`` + * ``triton-fp8w8a8-block128`` + * ``none``(默认) + +.. option:: --quant_cfg + + 量化配置文件路径。可用于混合量化。 + + 示例可以在 test/advanced_config/mixed_quantization/llamacls-mix-down.yaml 中找到。 + +.. option:: --vit_quant_type + + ViT 量化方法,可选值: + + * ``ppl-w4a16-128`` + * ``flashllm-w6a16`` + * ``ao-int4wo-[32,64,128,256]`` + * ``ao-int8wo`` + * ``ao-fp8w8a16`` + * ``ao-fp6w6a16`` + * ``vllm-w8a8`` + * ``vllm-fp8w8a8`` + * ``none``(默认) + +.. option:: --vit_quant_cfg + + ViT 量化配置文件路径。可用于混合量化。 + + 示例可以在 lightllm/common/quantization/configs 中找到。 + +采样和生成参数 +------------ + +.. option:: --sampling_backend + + 采样使用的实现,可选值: + + * ``triton``: 使用 torch 和 triton kernel(默认) + * ``sglang_kernel``: 使用 sglang_kernel 实现 + +.. option:: --return_all_prompt_logprobs + + 返回所有提示 token 的 logprobs + +.. option:: --use_reward_model + + 使用奖励模型 + +.. option:: --long_truncation_mode + + 当 input_token_len + max_new_tokens > max_req_total_len 时的处理方式,可选值: + + * ``None``: 抛出异常(默认) + * ``head``: 移除一些头部 token 使 input_token_len + max_new_tokens <= max_req_total_len + * ``center``: 移除中心位置的一些 token 使 input_token_len + max_new_tokens <= max_req_total_len + +.. 
option:: --use_tgi_api + + 使用 tgi 输入和输出格式 + +MTP 多预测参数 +------------ + +.. option:: --mtp_mode + + 支持的 mtp 模式,可选值: + + * ``deepseekv3`` + * ``None``: 不启用 mtp(默认) + +.. option:: --mtp_draft_model_dir + + MTP 多预测功能的草稿模型路径 + + 用于加载 MTP 多输出 token 模型。 + +.. option:: --mtp_step + + 指定使用草稿模型预测的额外 token 数量,默认为 ``0`` + + 目前此功能仅支持 DeepSeekV3/R1 模型。 + 增加此值允许更多预测,但确保模型与指定的步数兼容。 + 目前 deepseekv3/r1 模型仅支持 1 步 + +DeepSeek 冗余专家参数 +---------- + +.. option:: --ep_redundancy_expert_config_path + + 冗余专家配置的路径。可用于 deepseekv3 模型。 + +.. option:: --auto_update_redundancy_expert + + 是否通过在线专家使用计数器为 deepseekv3 模型更新冗余专家。 + +监控和日志参数 +------------ + +.. option:: --disable_log_stats + + 禁用吞吐量统计日志记录 + +.. option:: --log_stats_interval + + 记录统计信息的间隔(秒),默认为 ``10`` + +.. option:: --health_monitor + + 检查服务健康状态并在出错时重启 + +.. option:: --metric_gateway + + 收集监控指标的地址 + +.. option:: --job_name + + 监控的作业名称,默认为 ``lightllm`` + +.. option:: --grouping_key + + 监控的分组键,格式为 key=value,可以指定多个 + +.. option:: --push_interval + + 推送监控指标的间隔(秒),默认为 ``10`` + +.. option:: --enable_monitor_auth + + 是否为 push_gateway 开启身份验证 \ No newline at end of file diff --git a/docs/CN/source/tutorial/deepseek_deployment.rst b/docs/CN/source/tutorial/deepseek_deployment.rst new file mode 100644 index 000000000..f017ee1f7 --- /dev/null +++ b/docs/CN/source/tutorial/deepseek_deployment.rst @@ -0,0 +1,346 @@ +.. _deepseek_deployment: + +DeepSeek 模型部署指南 +===================== + +LightLLM 支持多种 DeepSeek 模型的部署方案,包括 DeepSeek-R1、DeepSeek-V2、DeepSeek-V3 等。本文档详细介绍各种部署模式和配置方案。 + +部署模式概览 +----------- + +LightLLM 支持以下几种部署模式: + +1. **单机 TP 模式**: 使用张量并行在单机上部署 +2. **单机 EP 模式**: 使用专家并行在单机上部署 +3. **多机 TP 模式**: 跨多台机器使用张量并行 +4. **多机 EP 模式**: 跨多台机器使用专家并行 +5. **PD 分离模式**: 将预填充和解码分离部署 +6. **多 PD Master 模式**: 支持多个 PD Master 节点 + +1. 单机部署方案 +--------------- + +1.1 单机 TP 模式 (Tensor Parallel) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +适用于单台 H200 机器部署 DeepSeek-R1 模型。 + +**启动命令:** + +.. code-block:: bash + + # H200 单机 DeepSeek-R1 TP 模式 + LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 8 \ + --enable_fa3 + +**参数说明:** +- `LOADWORKER=18`: 模型加载线程数,提高加载速度 +- `--tp 8`: 张量并行度,使用8个GPU +- `--enable_fa3`: 启用 Flash Attention 3.0 +- `--port 8088`: 服务端口 + +1.2 单机 DP + EP 模式 (Data Parallel + Expert Parallel) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +适用于 DeepSeek-V2/V3 等 MoE 模型的专家并行部署。 + +**启动命令:** + +.. code-block:: bash + + # H200 单机 DeepSeek-R1 DP + EP 模式 + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 8 \ + --dp 8 \ + --enable_fa3 + +**参数说明:** +- `MOE_MODE=EP`: 设置专家并行模式 +- `--tp 8`: 张量并行度 +- `--dp 8`: 数据并行度,通常设置为与 tp 相同的值 +- `--enable_fa3`: 启用 Flash Attention 3.0 + +**可选优化参数:** +- `--enable_prefill_microbatch_overlap`: 启用预填充微批次重叠 +- `--enable_decode_microbatch_overlap`: 启用解码微批次重叠 + +2. 多机部署方案 +--------------- + +2.1 多机 TP 模式 +~~~~~~~~~~~~~~~~ + +适用于跨多台 H200/H100 机器部署。 + +**Node 0 启动命令:** + +.. code-block:: bash + + # H200/H100 多机 DeepSeek-R1 TP 模式 Node 0 + # 使用方法: sh multi_node_tp_node0.sh + export nccl_host=$1 + LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 0 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**Node 1 启动命令:** + +.. 
code-block:: bash + + # H200/H100 多机 DeepSeek-R1 TP 模式 Node 1 + # 使用方法: sh multi_node_tp_node1.sh + export nccl_host=$1 + LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 1 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**参数说明:** +- `--nnodes 2`: 总节点数 +- `--node_rank 0/1`: 当前节点排名 +- `--nccl_host`: NCCL 通信主机地址 +- `--nccl_port 2732`: NCCL 通信端口 + +2.2 多机 EP 模式 +~~~~~~~~~~~~~~~~ + +适用于跨多台机器部署 MoE 模型。 + +**Node 0 启动命令:** + +.. code-block:: bash + + # H200 多机 DeepSeek-R1 EP 模式 Node 0 + # 使用方法: sh multi_node_ep_node0.sh + export nccl_host=$1 + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --dp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 0 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**Node 1 启动命令:** + +.. code-block:: bash + + # H200 多机 DeepSeek-R1 EP 模式 Node 1 + # 使用方法: sh multi_node_ep_node1.sh + export nccl_host=$1 + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --dp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 1 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**可选优化参数:** +- `--enable_prefill_microbatch_overlap`: 启用预填充微批次重叠 +- `--enable_decode_microbatch_overlap`: 启用解码微批次重叠 + +3. PD 分离部署方案 +------------------ + +PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以更好地利用硬件资源。 + +3.1 单 PD Master 模式 +~~~~~~~~~~~~~~~~~~~~~ + +**步骤 1: 启动 PD Master 服务** + +.. code-block:: bash + + # PD Master for DeepSeek-R1 + # 使用方法: sh pd_master.sh + export pd_master_ip=$1 + python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 \ + --run_mode "pd_master" \ + --host $pd_master_ip \ + --port 60011 + +**步骤 2: 启动 Prefill 服务** + +.. code-block:: bash + + # PD prefill 模式 for DeepSeek-R1 (DP+EP) on H200 + # 使用方法: sh pd_prefill.sh + # nvidia-cuda-mps-control -d,运行MPS(可选, 有mps支持性能会好特别多,但是部分显卡和驱动环境开启mps会容易出现错误,建议升级驱动到较高版本,特别是H系列卡) + + export host=$1 + export pd_master_ip=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "prefill" \ + --tp 8 \ + --dp 8 \ + --host $host \ + --port 8019 \ + --nccl_port 2732 \ + --enable_fa3 \ + --disable_cudagraph \ + --pd_master_ip $pd_master_ip \ + --pd_master_port 60011 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_prefill_microbatch_overlap + +**步骤 3: 启动 Decode 服务** + +.. code-block:: bash + + # PD decode 模式 for DeepSeek-R1 (DP+EP) on H200 + # 使用方法: sh pd_decode.sh + export host=$1 + export pd_master_ip=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "decode" \ + --tp 8 \ + --dp 8 \ + --host $host \ + --port 8121 \ + --nccl_port 12322 \ + --enable_fa3 \ + --disable_cudagraph \ + --pd_master_ip $pd_master_ip \ + --pd_master_port 60011 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_decode_microbatch_overlap + +3.2 多 PD Master 模式 +~~~~~~~~~~~~~~~~~~~~~ + +支持多个 PD Master 节点,提供更好的负载均衡和高可用性。 + +**步骤 1: 启动 Config Server** + +.. 
code-block:: bash + + # Config Server + # 使用方法: sh config_server.sh + export config_server_host=$1 + python -m lightllm.server.api_server \ + --run_mode "config_server" \ + --config_server_host $config_server_host \ + --config_server_port 60088 + +**步骤 2: 启动多个 PD Master** + +.. code-block:: bash + + # PD Master 1 + # 使用方法: sh pd_master_1.sh + export host=$1 + export config_server_host=$2 + python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "pd_master" \ + --host $host \ + --port 60011 \ + --config_server_host $config_server_host \ + --config_server_port 60088 + + # PD Master 2 + # 使用方法: sh pd_master_2.sh + export host=$1 + export config_server_host=$2 + python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "pd_master" \ + --host $host \ + --port 60012 \ + --config_server_host $config_server_host \ + --config_server_port 60088 + +**步骤 3: 启动 Prefill 和 Decode 服务** + +.. code-block:: bash + + # Prefill 服务 + export host=$1 + export config_server_host=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "prefill" \ + --host $host \ + --port 8019 \ + --tp 8 \ + --dp 8 \ + --nccl_port 2732 \ + --enable_fa3 \ + --disable_cudagraph \ + --config_server_host $config_server_host \ + --config_server_port 60088 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_prefill_microbatch_overlap + + # Decode 服务 + export host=$1 + export config_server_host=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "decode" \ + --host $host \ + --port 8121 \ + --nccl_port 12322 \ + --tp 8 \ + --dp 8 \ + --enable_fa3 \ + --config_server_host $config_server_host \ + --config_server_port 60088 + # if you want to enable microbatch overlap, you can uncomment the following lines + #--enable_decode_microbatch_overlap + +4. 测试和验证 +------------- + +4.1 基础功能测试 +~~~~~~~~~~~~~~~ + +.. code-block:: bash + + curl http://server_ip:server_port/generate \ + -H "Content-Type: application/json" \ + -d '{ + "inputs": "What is AI?", + "parameters":{ + "max_new_tokens":17, + "frequency_penalty":1 + } + }' + +4.2 性能基准测试 +~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # DeepSeek-R1 性能测试 + cd test + python benchmark_client.py \ + --num_clients 100 \ + --input_num 2000 \ + --tokenizer_path /path/DeepSeek-R1/ \ + --url http://127.0.0.1:8088/generate_stream + +以上所有脚本可以参考 `test/start_scripts/multi_pd_master/` 目录下的脚本。 \ No newline at end of file diff --git a/docs/CN/source/tutorial/multimodal.rst b/docs/CN/source/tutorial/multimodal.rst new file mode 100644 index 000000000..61c73109c --- /dev/null +++ b/docs/CN/source/tutorial/multimodal.rst @@ -0,0 +1,140 @@ +多模态模型启动配置 +============================ + +LightLLM支持多种多模态模型的推理,下面以InternVL为例,对多模态服务的启动命令进行说明。 + +基本启动命令 +------------ + +.. 
code-block:: bash + + INTERNVL_IMAGE_LENGTH=256 \ + LOADWORKER=12 \ + python -m lightllm.server.api_server \ + --port 8080 \ + --tp 2 \ + --model_dir ${MODEL_PATH} \ + --mem_fraction 0.8 \ + --trust_remote_code \ + --enable_multimodal + +核心参数说明 +------------ + +环境变量 +^^^^^^^^ + +- **INTERNVL_IMAGE_LENGTH**: 设置InternVL模型的图像token长度,默认为256 +- **LOADWORKER**: 设置模型加载的工作进程数 + +基础服务参数 +^^^^^^^^^^^ + +- **--port 8080**: API服务器监听端口 +- **--tp 2**: 张量并行度(Tensor Parallelism) +- **--model_dir**: InternVL模型文件路径 +- **--mem_fraction 0.8**: GPU显存使用比例 +- **--trust_remote_code**: 允许加载自定义模型代码 +- **--enable_multimodal**: 启用多模态功能 + +高级配置参数 +------------ + +.. code-block:: bash + + --visual_infer_batch_size 2 \ + --cache_capacity 500 \ + --visual_dp dp_size \ + --visual_tp tp_size + +- **--visual_infer_batch_size 2**: 视觉推理批处理大小 +- **--cache_capacity 500**: 图像嵌入缓存容量 +- **--visual_dp 2**: 视觉模型数据并行度 +- **--visual_tp 2**: 视觉模型张量并行度 + +.. note:: 为了使每一个GPU的显存负载相同,需要visual_dp * visual_tp = tp,例如tp=2,则visual_dp=1, visual_tp=2。 + +ViT部署方式 +----------- + +ViT TP (张量并行) +^^^^^^^^^^^^^^^^^ + +- 默认使用 +- --visual_tp tp_size 开启张量并行 + +ViT DP (数据并行) +^^^^^^^^^^^^^^^^^ + +- 将不同图像批次分布到多个GPU +- 每个GPU运行完整ViT模型副本 +- --visual_dp dp_size 开启数据并行 + +图像缓存机制 +------------ +LightLLM 会对输入图片的embeddings进行缓存,多轮对话中,如果图片相同,则可以直接使用缓存的embeddings,避免重复推理。 + +- **--cache_capacity**: 控制缓存的image embed数量 +- 根据图片MD5哈希值进行匹配 +- 采用LRU(最近最少使用)淘汰机制 +- 命中的图片cache可直接跳过ViT推理 + + +测试 +------------ + +.. code-block:: python + + import json + import requests + import base64 + + def run(query, uris): + images = [] + for uri in uris: + if uri.startswith("http"): + images.append({"type": "url", "data": uri}) + else: + with open(uri, 'rb') as fin: + b64 = base64.b64encode(fin.read()).decode("utf-8") + images.append({'type': "base64", "data": b64}) + + data = { + "inputs": query, + "parameters": { + "max_new_tokens": 200, + # The space before <|endoftext|> is important, + # the server will remove the first bos_token_id, + # but QWen tokenizer does not has bos_token_id + "stop_sequences": [" <|endoftext|>", " <|im_start|>", " <|im_end|>"], + }, + "multimodal_params": { + "images": images, + } + } + + url = "http://127.0.0.1:8000/generate" + headers = {'Content-Type': 'application/json'} + response = requests.post(url, headers=headers, data=json.dumps(data)) + return response + + query = """ + <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + + 这是什么?<|im_end|> + <|im_start|>assistant + """ + + response = run( + uris = [ + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" + ], + query = query + ) + + if response.status_code == 200: + print(f"Result: {response.json()}") + else: + print(f"Error: {response.status_code}, {response.text}") diff --git a/docs/CN/source/tutorial/openai.rst b/docs/CN/source/tutorial/openai.rst new file mode 100644 index 000000000..c98b0dd9f --- /dev/null +++ b/docs/CN/source/tutorial/openai.rst @@ -0,0 +1,250 @@ +.. _openai_api: + +LightLLM OpenAI 接口调用示例 +============================ + +LightLLM 提供了与 OpenAI API 完全兼容的接口,支持所有标准的 OpenAI 功能,包括 function calling。本文档将详细介绍如何使用 LightLLM 的 OpenAI 接口。 + +基础配置 +-------- + +首先确保 LightLLM 服务已经启动: + +.. code-block:: bash + + # 启动 LightLLM 服务 + python -m lightllm.server.api_server \ + --model_dir /path/to/your/model \ + --port 8088 \ + --tp 1 + +基础对话示例 +------------ + +1. 简单对话 +~~~~~~~~~~~ + +.. 
code-block:: python + + import requests + import json + + # 配置 + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + # 请求数据 + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "你好,请介绍一下你自己"} + ], + "temperature": 0.7, + "max_tokens": 1000 + } + + # 发送请求 + response = requests.post(url, headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + print("回复:", result["choices"][0]["message"]["content"]) + else: + print("错误:", response.status_code, response.text) + +2. 流式对话 +~~~~~~~~~~~ + +.. code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "请写一个关于人工智能的短文"} + ], + "stream": True, + "temperature": 0.7, + "max_tokens": 1000 + } + + # 流式请求 + response = requests.post(url, headers=headers, json=data, stream=True) + + if response.status_code == 200: + for line in response.iter_lines(): + if line: + line = line.decode('utf-8') + if line.startswith('data: '): + data_str = line[6:] # 移除 "data: " 前缀 + if data_str == '[DONE]': + break + try: + chunk = json.loads(data_str) + if chunk['choices'][0]['delta'].get('content'): + print(chunk['choices'][0]['delta']['content'], end='', flush=True) + except json.JSONDecodeError: + continue + else: + print("错误:", response.status_code, response.text) + +Function Calling 示例 +-------------------- + +LightLLM 支持 OpenAI 的 function calling 功能,提供了三种模型的函数调用解析,启动服务的时候指定 --tool_call_parser 参数来选择。启动服务命令为: + +.. code-block:: bash + + python -m lightllm.server.api_server \ + --model_dir /path/to/your/model \ + --port 8088 \ + --tp 1 \ + --tool_call_parser qwen25 + # 可选的参数为 qwen25, llama3, mistral + +1. 基础 Function Calling +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + # 定义函数 + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "获取指定城市的当前天气信息", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "城市名称,例如:北京、上海" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "温度单位" + } + }, + "required": ["city"] + } + } + } + ] + + # 请求数据 + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "北京今天天气怎么样?"} + ], + "tools": tools, + "tool_choice": "auto", # 让模型自动决定是否调用函数 + "temperature": 0.7, + "max_tokens": 1000 + } + + # 发送请求 + response = requests.post(url, headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + message = result["choices"][0]["message"] + + # 检查是否有函数调用 + if message.get("tool_calls"): + print("模型决定调用函数:") + for tool_call in message["tool_calls"]: + print(f"函数名: {tool_call['function']['name']}") + print(f"参数: {tool_call['function']['arguments']}") + else: + print("回复:", message["content"]) + else: + print("错误:", response.status_code, response.text) + +2. 流式 Function Calling +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + tools = [ + { + "type": "function", + "function": { + "name": "calculate", + "description": "执行数学计算", + "parameters": { + "type": "object", + "properties": { + "expression": {"type": "string", "description": "数学表达式"} + }, + "required": ["expression"] + } + } + } + ] + + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "请计算 25 * 4 + 10 的结果"} + ], + "tools": tools, + "tool_choice": "auto", + "stream": True, + "temperature": 0.7, + "max_tokens": 1000 + } + + response = requests.post(url, headers=headers, json=data, stream=True) + + if response.status_code == 200: + content_buffer = "" + tool_calls_buffer = [] + + for line in response.iter_lines(): + if line: + line = line.decode('utf-8') + if line.startswith('data: '): + data_str = line[6:] + if data_str == '[DONE]': + break + try: + chunk = json.loads(data_str) + delta = chunk['choices'][0]['delta'] + + # 处理内容 + if delta.get('content'): + content_buffer += delta['content'] + print(delta['content'], end='', flush=True) + + # 处理函数调用 + if delta.get('tool_calls'): + for tool_call in delta['tool_calls']: + tool_calls_buffer.append(tool_call) + print(f"\n[函数调用: {tool_call['function']['name']}]") + if tool_call['function'].get('arguments'): + print(f"参数: {tool_call['function']['arguments']}") + + except json.JSONDecodeError: + continue + else: + print("错误:", response.status_code, response.text) diff --git a/docs/CN/source/tutorial/reward_model.rst b/docs/CN/source/tutorial/reward_model.rst new file mode 100644 index 000000000..46725ee22 --- /dev/null +++ b/docs/CN/source/tutorial/reward_model.rst @@ -0,0 +1,62 @@ +奖励模型部署配置 +============================ + +LightLLM 支持多种奖励模型的推理,用于评估对话质量和生成奖励分数。目前支持的奖励模型包括 InternLM2 Reward 和 Qwen2 Reward 等。 + +基本启动命令 +------------ + +.. code-block:: bash + + python -m lightllm.server.api_server \ + --port 8080 \ + --model_dir ${MODEL_PATH} \ + --trust_remote_code \ + --use_reward_model # 启用奖励模型功能(必需参数) + +测试示例 +-------- + +Python 测试代码 +^^^^^^^^^^^^^^^ + +.. code-block:: python + + import json + import requests + + # InternLM2 Reward 测试 + query = "<|im_start|>user\nHello! What's your name?<|im_end|>\n<|im_start|>assistant\nMy name is InternLM2! A helpful AI assistant. What can I do for you?<|im_end|>\n<|reward|>" + + url = "http://127.0.0.1:8000/get_score" + headers = {'Content-Type': 'application/json'} + + data = { + "chat": query, + "parameters": { + "frequency_penalty": 1 + } + } + + response = requests.post(url, headers=headers, data=json.dumps(data)) + + if response.status_code == 200: + result = response.json() + print(f"奖励分数: {result['score']}") + print(f"输入token数: {result['prompt_tokens']}") + else: + print(f"错误: {response.status_code}, {response.text}") + +cURL 测试命令 +^^^^^^^^^^^^ + +.. code-block:: bash + + curl http://localhost:8000/get_score \ + -H "Content-Type: application/json" \ + -d '{ + "chat": "<|im_start|>user\nHello! 
What is AI?<|im_end|>\n<|im_start|>assistant\nAI stands for Artificial Intelligence, which refers to the simulation of human intelligence in machines.<|im_end|>\n<|reward|>", + "parameters": { + "frequency_penalty": 1 + } + }' \ No newline at end of file diff --git a/docs/CN/source/user/openapi_docs.rst b/docs/CN/source/user/openapi_docs.rst deleted file mode 100755 index e0921a27f..000000000 --- a/docs/CN/source/user/openapi_docs.rst +++ /dev/null @@ -1,43 +0,0 @@ -OpenApi docs -================================= - -下面的文档由openapi自动生成,在使用Lightllm部署完以后,使用 ``host:port/docs`` 就可以打开 - -.. raw:: html - - - - - - FastAPI - Swagger UI - - - -
-
- - - - - diff --git a/docs/EN/source/_static/openapi.json b/docs/EN/source/_static/openapi.json deleted file mode 100755 index d591ecf0a..000000000 --- a/docs/EN/source/_static/openapi.json +++ /dev/null @@ -1,536 +0,0 @@ -{ - "openapi": "3.0.2", - "info": { - "title": "FastAPI", - "version": "0.1.0" - }, - "paths": { - "/liveness": { - "get": { - "summary": "Liveness", - "operationId": "liveness_liveness_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "post": { - "summary": "Liveness", - "operationId": "liveness_liveness_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/readiness": { - "get": { - "summary": "Readiness", - "operationId": "readiness_readiness_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "post": { - "summary": "Readiness", - "operationId": "readiness_readiness_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/health": { - "get": { - "summary": "Check server health", - "operationId": "healthcheck_health_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "head": { - "summary": "Check server health", - "operationId": "healthcheck_health_head", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/healthz": { - "get": { - "summary": "Check server health", - "operationId": "healthcheck_healthz_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/token_load": { - "get": { - "summary": "Get the current server's load on tokens", - "operationId": "token_load_token_load_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/generate": { - "post": { - "summary": "Generate", - "operationId": "generate_generate_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/generate_stream": { - "post": { - "summary": "Generate Stream", - "operationId": "generate_stream_generate_stream_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/get_score": { - "post": { - "summary": "Get Score", - "operationId": "get_score_get_score_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/": { - "post": { - "summary": "Compat Generate", - "operationId": "compat_generate__post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/v1/chat/completions": { - "post": { - "summary": "Chat Completions", - "operationId": "chat_completions_v1_chat_completions_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionRequest" - } - } 
- }, - "required": true - }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ChatCompletionResponse" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } - } - } - }, - "/tokens": { - "get": { - "summary": "Tokens", - "operationId": "tokens_tokens_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - }, - "post": { - "summary": "Tokens", - "operationId": "tokens_tokens_post", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - }, - "/metrics": { - "get": { - "summary": "Metrics", - "operationId": "metrics_metrics_get", - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": {} - } - } - } - } - } - } - }, - "components": { - "schemas": { - "ChatCompletionRequest": { - "title": "ChatCompletionRequest", - "required": [ - "model", - "messages" - ], - "type": "object", - "properties": { - "model": { - "title": "Model", - "type": "string" - }, - "messages": { - "title": "Messages", - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "type": "string" - } - } - }, - "function_call": { - "title": "Function Call", - "type": "string", - "default": "none" - }, - "temperature": { - "title": "Temperature", - "type": "number", - "default": 1 - }, - "top_p": { - "title": "Top P", - "type": "number", - "default": 1.0 - }, - "n": { - "title": "N", - "type": "integer", - "default": 1 - }, - "stream": { - "title": "Stream", - "type": "boolean", - "default": false - }, - "stop": { - "title": "Stop", - "anyOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "type": "string" - } - } - ] - }, - "max_tokens": { - "title": "Max Tokens", - "type": "integer", - "default": 16 - }, - "presence_penalty": { - "title": "Presence Penalty", - "type": "number", - "default": 0.0 - }, - "frequency_penalty": { - "title": "Frequency Penalty", - "type": "number", - "default": 0.0 - }, - "logit_bias": { - "title": "Logit Bias", - "type": "object", - "additionalProperties": { - "type": "number" - } - }, - "user": { - "title": "User", - "type": "string" - }, - "do_sample": { - "title": "Do Sample", - "type": "boolean", - "default": false - }, - "top_k": { - "title": "Top K", - "type": "integer", - "default": -1 - }, - "ignore_eos": { - "title": "Ignore Eos", - "type": "boolean", - "default": false - } - } - }, - "ChatCompletionResponse": { - "title": "ChatCompletionResponse", - "required": [ - "model", - "choices", - "usage" - ], - "type": "object", - "properties": { - "id": { - "title": "Id", - "type": "string" - }, - "object": { - "title": "Object", - "type": "string", - "default": "chat.completion" - }, - "created": { - "title": "Created", - "type": "integer" - }, - "model": { - "title": "Model", - "type": "string" - }, - "choices": { - "title": "Choices", - "type": "array", - "items": { - "$ref": "#/components/schemas/ChatCompletionResponseChoice" - } - }, - "usage": { - "$ref": "#/components/schemas/UsageInfo" - } - } - }, - "ChatCompletionResponseChoice": { - "title": "ChatCompletionResponseChoice", - "required": [ - "index", - "message" - ], - "type": "object", - "properties": { - "index": { - 
"title": "Index", - "type": "integer" - }, - "message": { - "$ref": "#/components/schemas/ChatMessage" - }, - "finish_reason": { - "title": "Finish Reason", - "enum": [ - "stop", - "length", - "function_call" - ], - "type": "string" - } - } - }, - "ChatMessage": { - "title": "ChatMessage", - "required": [ - "role", - "content" - ], - "type": "object", - "properties": { - "role": { - "title": "Role", - "type": "string" - }, - "content": { - "title": "Content", - "type": "string" - } - } - }, - "HTTPValidationError": { - "title": "HTTPValidationError", - "type": "object", - "properties": { - "detail": { - "title": "Detail", - "type": "array", - "items": { - "$ref": "#/components/schemas/ValidationError" - } - } - } - }, - "UsageInfo": { - "title": "UsageInfo", - "type": "object", - "properties": { - "prompt_tokens": { - "title": "Prompt Tokens", - "type": "integer", - "default": 0 - }, - "completion_tokens": { - "title": "Completion Tokens", - "type": "integer", - "default": 0 - }, - "total_tokens": { - "title": "Total Tokens", - "type": "integer", - "default": 0 - } - } - }, - "ValidationError": { - "title": "ValidationError", - "required": [ - "loc", - "msg", - "type" - ], - "type": "object", - "properties": { - "loc": { - "title": "Location", - "type": "array", - "items": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] - } - }, - "msg": { - "title": "Message", - "type": "string" - }, - "type": { - "title": "Error Type", - "type": "string" - } - } - } - } - } -} \ No newline at end of file diff --git a/docs/EN/source/assets/logos/lightllm-logo.png b/docs/EN/source/assets/logos/lightllm-logo.png index 1a9794bf8..5b3b63917 100755 Binary files a/docs/EN/source/assets/logos/lightllm-logo.png and b/docs/EN/source/assets/logos/lightllm-logo.png differ diff --git a/docs/EN/source/dev/performance.rst b/docs/EN/source/dev/performance.rst deleted file mode 100755 index 3ecfb58d5..000000000 --- a/docs/EN/source/dev/performance.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. _Performance_Benchmark: - -Performance -=========== - -Service Performance -------------------- - -We compared the service performance of LightLLM and vLLM==0.1.2 on LLaMA-7B using an A800 with 80G GPU memory. - -To begin, prepare the data as follows: - -.. code-block:: shell - - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -Launch the service: - -.. code-block:: shell - - python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto - -Evaluation: - -.. code-block:: shell - - cd test - python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 - -The performance comparison results are presented below: - -+-------------------+-------------------+ -| vLLM | LightLLM | -+===================+===================+ -| Total time: 361.79| Total time: 188.85| -| Throughput: 5.53 | Throughput: 10.59 | -| requests/s | requests/s | -+-------------------+-------------------+ - -Static Inference Performance ----------------------------- - -For debugging, we offer static performance testing scripts for various models. For instance, you can evaluate the inference performance of the LLaMA model by: - -.. 
code-block:: shell - - cd test/model - python test_llama.py \ No newline at end of file diff --git a/docs/EN/source/dev/router.rst b/docs/EN/source/dev/router.rst deleted file mode 100755 index 8b7bb9361..000000000 --- a/docs/EN/source/dev/router.rst +++ /dev/null @@ -1,61 +0,0 @@ -.. _Efficient_Router: - -Efficient Router -=================== - -The Efficient Router is introduced to manage incoming requests and dynamically determine whether the request can be fused with a batch that is already running for inference. The merging criterion is to estimate whether the maximum token occupancy in the merged inference process is smaller than the maximum capacity that the hardware can be accommodated. Here, we set this maximum capacity as max_total_token_num. With the support of Token Attention, we can accurately manage the usage of tokens, and can ensure that there is never a risk of out-of-memory (OOM). - -.. image:: ../assets/lightllm/ER1.png - :alt: Efficient_Router1 - :align: center - - -As shown in the figure above, each row represents the current running state of a request, the yellow colour represents the historical kv cache tokens that have been run, each grid represents a token, the grey colour represents the tokens to be generated. The number of tokens to be generated is determined by the maximum output length set for each request and the number of tokens that have been generated. In the above figure, the second row of the green grid represents a newly arrived request, and the figure lists all the requests in ascending order according to the length of the output to be generated. - -If we assume that the new requests are fused into a Batch for inference, the maximum token usage will inevitably occur at one of the time points, Time 1, Time 2, or Time 3. We only need to calculate if the token usage at these three time points does not exceed the max_total_token_num, which indicates that the new request can be added to the Batch for fused inference. - -The total used tokens of Time 1 is equal to the number of yellow cells plus the number of green cells (see the figure below) - -.. image:: ../assets/lightllm/ER2.png - :alt: Efficient_Router1 - :align: center - - -The total used tokens of Time 2 is equal to the number of yellow squares plus the number of green squares (see the figure below) - -.. image:: ../assets/lightllm/ER3.png - :alt: Efficient_Router1 - :align: center - -The total used tokens of Time 3 is equal to the number of yellow squares (see the figure below) - -.. image:: ../assets/lightllm/ER4.png - :alt: Efficient_Router1 - :align: center - -The actual maximum token usage is always one of Time 1, Time 2, or Time 3. - -As long as the maximum token usage during the dynamic inference process is lower than max_total_token_num, it indicates that new requests can be batched for inference. - -To quickly calculate the maximum token usage required for all requests in a batch, we have implemented an efficient example using numpy. - - -.. 
code-block:: python - - import numpy as np - - def demo(): - max_total_token_num = 100 - req_list = [(5, 4), (4, 3), (5, 3), (3, 2), (4, 2)] # (run_len, left_output_len) - req_list.sort(key=lambda x: -x[1]) - - left_out_len_array = np.array([e[1] for e in req_list]) - has_run_len_array = np.array([e[0] for e in req_list]) - cum_run_len_array = np.cumsum(has_run_len_array) - size_array = np.arange(1, len(req_list) + 1, 1) - need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max() - - if need_max_token_num <= max_total_token_num: - print("ok") - else: - print("oom") \ No newline at end of file diff --git a/docs/EN/source/dev/token_attention.rst b/docs/EN/source/dev/token_attention.rst deleted file mode 100755 index bb2ca24a7..000000000 --- a/docs/EN/source/dev/token_attention.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. _TokenAttention: - -TokenAttention -======================= - -Transformers form the basis of modern large language models. During autoregressive decoding, these models cache key-value tensors of context tokens into GPU memory to facilitate fast generation of the next token. However, these caches occupy significant GPU memory. The unpredictable nature of cache size, due to the variability in the length of each request, exacerbates the issue, resulting in significant memory fragmentation in the absence of a suitable memory management mechanism. - -To alleviate this issue, PagedAttention was proposed to store the KV cache in non-contiguous memory spaces. It partitions the KV cache of each sequence into multiple blocks, with each block containing the keys and values for a fixed number of tokens. This approach effectively controls memory waste within the last block during attention computation. While PagedAttention alleviates memory fragmentation to some extent, it still leaves room for memory waste. Additionally, when handling multiple high-concurrency requests, the allocation and deallocation of memory blocks fall short of efficiency, leading to suboptimal memory utilization. - -To address the above challenges, we introduce TokenAttention, an attention mechanism that manages key and value caching at the token level. Compared to PagedAttention, our TokenAttention not only minimizes memory fragmentation and enables efficient memory sharing but also facilitates efficient memory allocation and deallocation. It allows for more precise and fine-grained memory management, thus optimizing memory utilization. - -.. list-table:: Feature Comparison - :widths: 30 15 15 - :header-rows: 1 - - * - Features - - PagedAttention - - TokenAttention - * - Low memory fragmentation - - ✓ - - ✓ - * - Efficient memory sharing - - ✓ - - ✓ - * - Efficient memory allocation and deallocation - - ✗ - - ✓ - * - Fine-grained memory management - - ✗ - - ✓ - - -The operation mechanism of TokenAttention is illustrated in the figure below: - -.. figure:: ../assets/lightllm/token_attn.gif - :width: 100% - :align: center - :alt: Lightllm - :class: no-scaled-link - - -During model initialization, the KV cache is pre-allocated based on the user-set **max_total_token_num** and a Token Table is created to record the actual storage locations of input tokens. - -When handling new requests, the system first checks for available contiguous space in the pre-allocated Token cache for storing the key-value (KV) cache. TokenAttention favors assigning contiguous graphics memory space for requests to minimize memory access during the inference process. 
Only when contiguous space is insufficient does it allocate non-contiguous graphics memory for the requests. Since memory management is conducted on a token-by-token basis, TokenAttention achieves nearly zero waste, yielding higher throughput compared to vllm. - -We have implemented an efficient TokenAttention operator using OpenAI Triton. When provided with a query vector, this operator can efficiently retrieve the corresponding KV cache based on the Token Table and conduct the attention computation. - -Upon completion of requests, the corresponding graphics memory can be quickly freed by deleting their records on the Token Table, which makes way for scheduling new requests. Given that TokenAttention pre-allocates all KV cache space during model initialization, it can efficiently release memory for completed requests and merge different batches of requests during dynamic scheduling, thereby effectively maximizing GPU utilization. - -The specific steps are as follows: - - -1. During model initialization, the KV cache is pre-allocated based on the user-set max_total_token_num and a Token Table is created to record the actual storage locations of input tokens. -2. When handling new requests, the system first checks for available contiguous space in the pre-allocated Token cache for storing the key-value (KV) cache. TokenAttention favors assigning contiguous graphics memory space for requests to minimize memory access during the inference process. Only when contiguous space is insufficient does it allocate non-contiguous graphics memory for the requests. The allocated space is recorded in the Token Table for subsequent attention calculations. -3. For cache of newly generated tokens, it is only necessary to find unused space from the pre-allocated token cache and add the corresponding entry to the Token Table. Moreover, to efficiently allocate and release the Cache, we utilize the parallel computing capabilities of torch Tensor on the GPU to manage the state of the pre-allocated Token Cache. First, we define the states as follows: - - .. code-block:: python - - self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda") - self._mem_cum_sum = torch.empty((size,), dtype=torch.int32, device="cuda") - self.indexes = torch.arange(0, size, dtype=torch.long, device="cuda") - self.can_use_mem_size = size - - - The mem_state records the usage status of the cache, where 1 represents unused and 0 represents used. The _mem_cum_sum is used for the cumulative sum of mem_state which is used to efficiently identify and select unused space for cache allocation. The allocation process is as follows: - - .. code-block:: python - - torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self._mem_cum_sum) - # - select_index = torch.logical_and(self._mem_cum_sum <= need_size, self.mem_state == 1) - select_index = self.indexes[select_index] - self.mem_state[select_index] = 0 - self.can_use_mem_size -= len(select_index) - - - It can be observed that our cache state management is all done on the GPU, fully utilizing the parallel capabilities of torc, thereby allowing the system to efficiently allocate cache space for each request. - -4. Upon completion of requests, the corresponding graphics memory can be quickly freed by deleting their records on the Token Table, which makes way for scheduling new requests. - - .. code-block:: python - - self.can_use_mem_size += free_index.shape[0] - self.mem_state[free_index] = 1 - -5. 
Token Attention allows for zero wastage of GPU memory, due to its GPU memory management at the token level. It can accurately calculate how many new tokens the system can accommodate for computation. Therefore, when combined with a high-performance router to manage requests, it can continuously add new requests during the inference process, fully utilizing every piece of GPU memory and maximizing GPU utilization. - diff --git a/docs/EN/source/lightllm/lightllm_impl.rst b/docs/EN/source/framework/framework.rst old mode 100755 new mode 100644 similarity index 100% rename from docs/EN/source/lightllm/lightllm_impl.rst rename to docs/EN/source/framework/framework.rst diff --git a/docs/EN/source/framework/router.rst b/docs/EN/source/framework/router.rst new file mode 100644 index 000000000..4f05716ed --- /dev/null +++ b/docs/EN/source/framework/router.rst @@ -0,0 +1,62 @@ +.. _Efficient_Router: + +Efficient Router +================ + +Introducing an efficient router to manage incoming requests and dynamically determine whether the request can be merged with already running inference batches. +The merge criterion is whether the estimated maximum token usage during merged inference is less than the maximum capacity that the hardware can accommodate. +Here, we set this maximum capacity to ``max_total_token_num``. With the support of **Token Attention**, we can accurately manage token usage and ensure that out-of-memory situations never occur. + +.. image:: ../assets/lightllm/ER1.png + :alt: Efficient_Router1 + :align: center + +As shown in the figure above, each row represents the current running state of a request, yellow represents historical kv cache tokens that have already been run, each cell represents a token, and gray represents tokens to be generated. +The number of generated tokens is determined by the maximum output length set for each request and the number of tokens already generated. +In the figure above, the second row of green grid represents a newly arrived request, and all requests are listed in ascending order according to the length of output to be generated. + +If we assume that the new request is merged into a batch for inference, the maximum token usage will necessarily appear at one of time point 1, time 2, or time 3. We only need to calculate whether the token usage at these time points reaches the maximum value. If none of the three time points exceed max_total_token_num, it means the new request can be added to the batch for merged inference. + +Total token usage at time 1 equals the number of yellow cells plus the number of green cells (see figure below) + +.. image:: ../assets/lightllm/ER2.png + :alt: Efficient_Router1 + :align: center + +Total token usage at time 2 equals the number of yellow squares plus the number of green squares (see figure below) + +.. image:: ../assets/lightllm/ER3.png + :alt: Efficient_Router1 + :align: center + +Total token usage at time 3 equals the number of yellow squares (see figure below) + +.. image:: ../assets/lightllm/ER4.png + :alt: Efficient_Router1 + :align: center + +The actual maximum token usage is always one of time 1, time 2, or time 3. + +As long as the maximum token usage during dynamic inference is below max_total_token_num, it means new requests can be batched for inference. + +To quickly calculate the maximum token usage required by all requests in the batch, we implemented an efficient example using numpy. + +.. 
code-block:: python + + import numpy as np + + def demo(): + max_total_token_num = 100 + req_list = [(5, 4), (4, 3), (5, 3), (3, 2), (4, 2)] # (run_len, left_output_len) + req_list.sort(key=lambda x: -x[1]) + + left_out_len_array = np.array([e[1] for e in req_list]) + has_run_len_array = np.array([e[0] for e in req_list]) + cum_run_len_array = np.cumsum(has_run_len_array) + size_array = np.arange(1, len(req_list) + 1, 1) + need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max() + + if need_max_token_num <= max_total_token_num: + print("ok") + else: + print("oom") \ No newline at end of file diff --git a/docs/EN/source/framework/token_attention.rst b/docs/EN/source/framework/token_attention.rst new file mode 100644 index 000000000..0799f62fd --- /dev/null +++ b/docs/EN/source/framework/token_attention.rst @@ -0,0 +1,83 @@ +.. _TokenAttention: + +TokenAttention +============== + +Transformers form the foundation of modern large language models. During autoregressive decoding, these models cache key-value tensors of context tokens in GPU memory for fast generation of the next token. However, these caches occupy a large amount of GPU memory. Due to the variability in request lengths, the unpredictability of cache sizes exacerbates this problem, leading to severe memory fragmentation in the absence of appropriate memory management mechanisms. + +To alleviate this issue, PagedAttention was proposed to store KV cache in non-contiguous memory spaces. It divides the KV cache of each sequence into multiple blocks, with each block containing keys and values for a fixed number of tokens. This method effectively controls memory waste within the last block during attention computation. While PagedAttention alleviates memory fragmentation to some extent, it still leaves room for memory waste. Additionally, when handling multiple high-concurrency requests, the efficiency of memory block allocation and deallocation is low, resulting in poor memory utilization. + +To address the above challenges, we introduced TokenAttention, an attention mechanism that manages key and value cache at the token level. Compared to PagedAttention, our TokenAttention can not only minimize memory fragmentation and achieve efficient memory sharing, but also promote efficient memory allocation and deallocation. It allows for more precise and fine-grained memory management, thereby optimizing memory utilization. + +.. list-table:: Feature Comparison + :widths: 30 15 15 + :header-rows: 1 + + * - Features + - PagedAttention + - TokenAttention + * - Low Memory Fragmentation + - ✓ + - ✓ + * - Efficient Memory Sharing + - ✓ + - ✓ + * - Efficient Memory Allocation and Deallocation + - ✗ + - ✓ + * - Fine-grained Memory Management + - ✗ + - ✓ + +The operation mechanism of TokenAttention is shown in the figure below: + +.. figure:: ../assets/lightllm/token_attn.gif + :width: 100% + :align: center + :alt: Lightllm + :class: no-scaled-link + +During model initialization, KV cache is pre-allocated according to the user-set **max_total_token_num**, and a Token Table is created to record the actual storage location of input tokens. + +When processing new requests, the system first checks if there is available contiguous space in the pre-allocated token cache for storing key-value (KV) cache. TokenAttention tends to allocate contiguous graphics memory space for requests to minimize memory access during inference. Only when contiguous space is insufficient will non-contiguous memory be allocated for requests. 
Since memory management is performed token by token, TokenAttention achieves almost zero waste and produces higher throughput compared to vllm. + +We implemented an efficient TokenAttention operator using OpenAI Triton. When provided with query vectors, this operator can efficiently retrieve the corresponding KV cache based on the Token Table and perform attention computation. + +After request completion, the corresponding memory can be quickly released by deleting records on the token table, making way for scheduling new requests. Since TokenAttention pre-allocates all KV cache space during model initialization, it can efficiently release memory for completed requests and merge requests from different batches during dynamic scheduling, effectively maximizing GPU utilization. + +Specific steps are as follows: + +1. During model initialization, the system pre-allocates KV cache memory according to the user-set ``max_total_token_num`` and creates a Token Table to record the actual storage location of input tokens. + +2. When processing new requests, the system first checks if there is available contiguous space in the pre-allocated token cache for storing KV Cache. TokenAttention tends to allocate contiguous memory for requests to minimize memory access during inference. Only when contiguous space is insufficient will non-contiguous memory be allocated for requests. The allocated space is recorded in the Token Table for subsequent attention computation. + +3. For caching newly generated tokens, it's only necessary to find unused space from the pre-allocated token cache and add the corresponding entries to the token table. Additionally, to efficiently allocate and deallocate cache, we utilize Torch Tensor's parallel computing capabilities on GPU to manage the state of pre-allocated token cache. First, we define the state as follows: + + .. code-block:: python + + self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda") + self._mem_cum_sum = torch.empty((size,), dtype=torch.int32, device="cuda") + self.indexes = torch.arange(0, size, dtype=torch.long, device="cuda") + self.can_use_mem_size = size + + ``mem_state`` records the usage state of the cache, where 1 represents unused and 0 represents used. ``_mem_cum_sum`` is used for the cumulative sum of ``mem_state``, used to efficiently identify and select unused space for cache allocation. The allocation process is as follows: + + .. code-block:: python + + torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self._mem_cum_sum) + # + select_index = torch.logical_and(self._mem_cum_sum <= need_size, self.mem_state == 1) + select_index = self.indexes[select_index] + self.mem_state[select_index] = 0 + self.can_use_mem_size -= len(select_index) + + It can be observed that our cache state management is entirely completed on GPU, fully utilizing torch's parallel capabilities, thereby allowing the system to efficiently allocate cache space for each request. + +4. After request completion, the corresponding memory can be quickly released by deleting records on the ``Token Table``, making way for scheduling new requests. + + .. code-block:: python + + self.can_use_mem_size += free_index.shape[0] + self.mem_state[free_index] = 1 + +5. Due to token-level GPU memory management, TokenAttention can achieve zero waste of GPU memory. It can accurately calculate how many new tokens the system can accommodate for computation. 
Therefore, when combined with ``Efficient Router`` to manage requests, it can continuously add new requests during inference, fully utilizing every piece of GPU memory and maximizing GPU utilization. \ No newline at end of file diff --git a/docs/EN/source/getting_started/benchmark.rst b/docs/EN/source/getting_started/benchmark.rst new file mode 100644 index 000000000..87caaa06a --- /dev/null +++ b/docs/EN/source/getting_started/benchmark.rst @@ -0,0 +1,199 @@ +Benchmark Testing Guide +======================= + +LightLLM provides multiple performance testing tools, including service performance testing and static inference performance testing. This document will detailedly introduce how to use these tools for performance evaluation. + +Service Performance Testing (Service Benchmark) +---------------------------------------------- + +Service performance testing is mainly used to evaluate LightLLM's performance in real service scenarios, including key metrics such as throughput and latency. + +QPS Testing (benchmark_qps.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +QPS (Queries Per Second) testing is the core tool for evaluating service performance, supporting LightLLM and OpenAI compatible API formats. + +**Usage:** + +.. code-block:: bash + + python test/benchmark/service/benchmark_qps.py \ + --url http://127.0.0.1:8000/generate_stream \ + --tokenizer_path /path/to/tokenizer \ + --num_clients 100 \ + --input_num 2000 \ + --input_qps 30.0 \ + --input_len 1024 \ + --output_len 128 \ + --server_api lightllm \ + --dump_file results.json + +**Main Parameter Description:** + +- ``--url``: Service address, supports LightLLM and OpenAI formats +- ``--tokenizer_path``: Tokenizer path +- ``--input_num``: Total number of test requests +- ``--input_qps``: Input QPS limit +- ``--input_len``: Input sequence length +- ``--output_len``: Output sequence length +- ``--server_api``: Service API type (lightllm/openai) +- ``--data_path``: Custom dataset path +- ``--continuous_send``: Whether to send continuously (0/1) +- ``--force_terminate``: Force termination mode (0/1) + +**Output Metrics:** + +- Total QPS: Overall queries per second +- Sender QPS: Sender QPS +- Avg Input Length: Average input length +- Avg Output Length: Average output length +- Total Throughput: Overall throughput (token/s) +- Input Throughput: Input throughput (token/s) +- Output Throughput: Output throughput (token/s) +- request_time P{25,50,75,90,95,99,100}: Request latency percentiles +- first_token_time P{25,50,75,90,95,99,100}: First token latency percentiles +- decode_token_time P{25,50,75,90,95,99,100}: Decode token latency percentiles + +Fixed Concurrency Testing (benchmark_client.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Used to evaluate performance under different client concurrency levels. + +**Usage:** + +.. code-block:: bash + + python test/benchmark/service/benchmark_client.py \ + --url http://127.0.0.1:8000/generate_stream \ + --tokenizer_path /path/to/tokenizer \ + --num_clients 100 \ + --input_num 2000 \ + --input_len 1024 \ + --output_len 128 \ + --server_api lightllm + +ShareGPT Dataset Testing (benchmark_sharegpt.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Performance testing using ShareGPT real conversation data. + +**Usage:** + +.. code-block:: bash + + $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +.. 
code-block:: bash + + python test/benchmark/service/benchmark_sharegpt.py \ + --dataset /path/to/sharegpt_dataset.json \ + --tokenizer /path/to/tokenizer \ + --num_prompts 1000 \ + --request_rate 10.0 + +**Main Parameters:** + +- ``--dataset``: ShareGPT format dataset path +- ``--tokenizer``: Tokenizer path +- ``--num_prompts``: Number of test prompts +- ``--request_rate``: Request rate (requests/s) + +Prompt Cache Testing +~~~~~~~~~~~~~~~~~~~ + +Evaluate prompt cache performance under different hit rates by adjusting --first_input_len, --output_len --subsequent_input_len to control hit rate. +Hit rate per round = (first_input_len + (output_len + subsequent_input_len) * (num_turns - 1)) / (first_input_len + (output_len + subsequent_input_len) * num_turns) +Note: Control concurrency and user numbers based on max_total_token_num to ensure all requests can fit, guaranteeing that the actual hit rate matches your preset hit rate. + +.. code-block:: bash + + python test/benchmark/service/benchmark_prompt_cache.py \ + --model_url http://127.0.0.1:8000/generate_stream \ + --model_name model \ + --num_workers 10 \ + --first_input_len 512 \ + --subsequent_input_len 512 \ + --output_len 128 \ + --num_turns 10 \ + --num_users 10 + +Parameter Description: + +- ``--model_url``: Service address +- ``--model_name``: Result save filename +- ``--num_workers``: Concurrency number +- ``--first_input_len``: First round input length +- ``--subsequent_input_len``: Subsequent round input length +- ``--output_len``: Output length +- ``--num_turns``: Number of rounds +- ``--num_users``: Number of users + +Static Inference Performance Testing (Static Inference Benchmark) +--------------------------------------------------------------- + +Static inference testing is used to evaluate model inference performance under fixed input conditions, mainly evaluating operator quality. + +Model Inference Testing (model_infer.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Main Features:** + +- Supports prefill and decode stage performance testing +- Supports microbatch overlap optimization +- Supports multi-GPU parallel inference +- Provides detailed throughput statistics + +**Usage:** + +.. code-block:: bash + + python test/benchmark/static_inference/test_model.py \ + --model_dir /path/to/model \ + --batch_size 32 \ + --input_len 1024 \ + --output_len 128 \ + --tp 2 \ + --data_type bf16 + +**Main Parameters:** + +- ``--model_dir``: Model path +- ``--batch_size``: Batch size +- ``--input_len``: Input sequence length +- ``--output_len``: Output sequence length +- ``--tp``: Tensor Parallel degree +- ``--data_type``: Data type (bf16/fp16/fp32) +- ``--enable_prefill_microbatch_overlap``: Enable prefill microbatch overlap, only applicable to DeepSeek model EP mode +- ``--enable_decode_microbatch_overlap``: Enable decode microbatch overlap, only applicable to DeepSeek model EP mode +- ``--torch_profile``: Enable torch profiler for performance analysis + +.. note:: + Complete startup parameters are not listed here. Static testing scripts also share Lightllm's startup parameters. For more startup configurations, please refer to :ref:`tutorial/api_server_args_zh`. 
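+To compare several configurations in one pass, the static test can be driven from a small script. The sketch below is illustrative only: the model path and batch sizes are placeholders, and it simply replays the command-line arguments documented above, so each run reports the throughput metrics listed next.
+
+.. code-block:: python
+
+    # Illustrative sweep driver for the static inference benchmark.
+    # MODEL_DIR and BATCH_SIZES are placeholders; adjust to your setup.
+    import subprocess
+
+    MODEL_DIR = "/path/to/model"
+    BATCH_SIZES = [1, 8, 32]
+
+    for bs in BATCH_SIZES:
+        cmd = [
+            "python", "test/benchmark/static_inference/test_model.py",
+            "--model_dir", MODEL_DIR,
+            "--batch_size", str(bs),
+            "--input_len", "1024",
+            "--output_len", "128",
+            "--tp", "2",
+            "--data_type", "bf16",
+        ]
+        print("running:", " ".join(cmd))
+        subprocess.run(cmd, check=True)  # each run prints its own prefill/decode throughput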
+ +**Output Metrics:** + +- Prefill stage throughput (tokens/s) +- Decode stage throughput (tokens/s) +- Latency statistics for each stage + +Multi-Token Prediction Performance Testing (model_infer_mtp.py) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Multi-token prediction static performance testing with 100% acceptance rate by default, used to evaluate the ultimate performance of multi-token prediction. Currently only supports DeepSeek series models. + +**Usage:** + +.. code-block:: bash + + python test/benchmark/static_inference/test_model.py \ + --model_dir /path/to/main_model \ + --mtp_mode deepseekv3 \ + --mtp_step 1 \ + --mtp_draft_model_dir /path/to/draft_model \ + --batch_size 32 \ + --input_len 1024 \ + --output_len 128 + +Parameter Description: + +- ``--model_dir``: Main model path \ No newline at end of file diff --git a/docs/EN/source/getting_started/faq.rst b/docs/EN/source/getting_started/faq.rst deleted file mode 100644 index 74845e1a0..000000000 --- a/docs/EN/source/getting_started/faq.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. _faq: - -- The LLaMA tokenizer fails to load. - - Consider resolving this by running the command: - - .. code-block:: shell - - pip install protobuf==3.20.0 - -- ``error : PTX .version 7.4 does not support .target sm_89`` - - Launch with: - - .. code-block:: shell - - bash tools/resolve_ptx_version python -m lightllm.server.api_server ... \ No newline at end of file diff --git a/docs/EN/source/getting_started/installation.rst b/docs/EN/source/getting_started/installation.rst index a76671d4d..35b398287 100755 --- a/docs/EN/source/getting_started/installation.rst +++ b/docs/EN/source/getting_started/installation.rst @@ -1,62 +1,60 @@ .. _installation: -Installation -============ +Installation Guide +================== -Lightllm is a Python-based inference framework, with operators implemented in Triton. +Lightllm is a pure Python-based inference framework with operators written in Triton. -Requirements ------------- +Environment Requirements +------------------------ * Operating System: Linux * Python: 3.9 -* GPU: Compute Capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.). - +* GPU: Compute Capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) .. _build_from_docker: -Installing with Docker +Installation via Docker ----------------------- -The easiest way to install Lightllm is by using the official image. You can directly pull and run the official image: +The easiest way to install Lightllm is using the official image. You can directly pull the official image and run it: .. code-block:: console $ # Pull the official image $ docker pull ghcr.io/modeltc/lightllm:main $ - $ # Run the image + $ # Run $ docker run -it --gpus all -p 8080:8080 \ - $ --shm-size 32g -v your_local_path:/data/ \ + $ --shm-size 1g -v your_local_path:/data/ \ $ ghcr.io/modeltc/lightllm:main /bin/bash -You can also manually build and run the image from the source: - +You can also manually build the image from source and run it: .. code-block:: console $ # Manually build the image $ docker build -t . $ - $ # Run the image + $ # Run $ docker run -it --gpus all -p 8080:8080 \ - $ --shm-size 32g -v your_local_path:/data/ \ + $ --shm-size 1g -v your_local_path:/data/ \ $ /bin/bash -Alternatively, you can use a script to automatically build and run the image: - +Or you can directly use the script to launch the image and run it with one click: .. 
code-block:: console - + $ # View script parameters $ python tools/quick_launch_docker.py --help .. note:: - If you are using multiple GPUs, you may need to increase the --shm-size parameter setting above. + If you use multiple GPUs, you may need to increase the --shm-size parameter setting above. If you need to run DeepSeek models in EP mode, please use the image + ghcr.io/modeltc/lightllm:main-deepep. .. _build_from_source: -Installing from Source ------------------------ +Installation from Source +------------------------ You can also install Lightllm from source: @@ -66,23 +64,31 @@ You can also install Lightllm from source: $ conda create -n lightllm python=3.9 -y $ conda activate lightllm $ - $ # Download the latest source code for Lightllm + $ # Download the latest Lightllm source code $ git clone https://github.com/ModelTC/lightllm.git $ cd lightllm $ - $ # Install Lightllm's dependencies + $ # Install Lightllm dependencies (cuda 12.4) $ pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124 $ $ # Install Lightllm $ python setup.py install -NOTE: If you are using torch with cuda 11.x instead, run `pip install nvidia-nccl-cu12==2.20.5` to support torch cuda graph. +NOTE: If you use torch with cuda 11.x for some reason, please run `pip install nvidia-nccl-cu12==2.20.5` to support torch cuda graph. .. note:: - The Lightllm code has been tested on various GPUs, including V100, A100, A800, 4090, and H800. - If you are using A100, A800, or similar GPUs, it is recommended to install triton==3.1.0: + Lightllm code has been tested on various GPUs including V100, A100, A800, 4090, and H800. + If you use A100, A800 and other graphics cards, it is recommended to install triton==3.0.0: + + .. code-block:: console + + $ pip install triton==3.0.0 --no-deps + + If you use H800, V100 and other graphics cards, it is recommended to install triton-nightly: .. code-block:: console - $ pip install triton==3.1.0 --no-deps \ No newline at end of file + $ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly --no-deps + + For specific reasons, please refer to: `issue `_ and `fix PR `_ \ No newline at end of file diff --git a/docs/EN/source/getting_started/quickstart.rst b/docs/EN/source/getting_started/quickstart.rst index bf2335658..2a8b9ad42 100755 --- a/docs/EN/source/getting_started/quickstart.rst +++ b/docs/EN/source/getting_started/quickstart.rst @@ -3,27 +3,26 @@ Quick Start =========== -Deploying a model with Lightllm is very straightforward and requires only two steps: +Deploying models with Lightllm is very simple, requiring only two steps at minimum: -1. Prepare the weight file for a model supported by Lightllm. -2. Start the model service using the command line. +1. Prepare model weight files supported by Lightllm. +2. Use command line to start the model service. 3. (Optional) Test the model service. .. note:: - Before continuing with this tutorial, please ensure you have completed the :ref:`installation guide `. + Before continuing with this tutorial, please ensure you have completed the :ref:`Installation Guide `. -1. Prepare the Model File -------------------------- +1. Prepare Model Files +---------------------- -The following content will demonstrate Lightllm's support for large language models using `Llama-2-7b-chat `_. You can refer to the article: `How to Quickly Download Hugging Face Models — A Summary of Methods `_ for methods to download models. 
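+As an alternative to the ``huggingface-cli`` steps below, the weights can also be fetched from Python with the ``huggingface_hub`` API. This is only a sketch; the repository id and target directory mirror the CLI example and can be changed as needed.
+
+.. code-block:: python
+
+    # Sketch: download the weights via the huggingface_hub Python API.
+    # The repo id and local_dir mirror the CLI example below.
+    from huggingface_hub import snapshot_download
+
+    snapshot_download(
+        repo_id="Qwen/Qwen3-8B",
+        local_dir="Qwen3-8B",
+    )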
+Download `Qwen3-8B `_ first. +Below is an example code for downloading the model: -Here is an example of how to download the model: - -(1) (Optional) Create a directory +(1) (Optional) Create folder .. code-block:: console - $ mkdir -p ~/models && cd ~/models + $ mkdirs ~/models && cd ~/models (2) Install ``huggingface_hub`` @@ -31,132 +30,37 @@ Here is an example of how to download the model: $ pip install -U huggingface_hub -(3) Download the model file +(3) Download model files .. code-block:: console - $ huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir Llama-2-7b-chat - -.. tip:: - The above code for downloading the model requires a stable internet connection and may take some time. You can use alternative download methods or other supported models as substitutes. For the latest list of supported models, please refer to the `project homepage `_. - + $ huggingface-cli download Qwen/Qwen3-8B --local-dir Qwen3-8B -2. Start the Model Service ---------------------------- +2. Start Model Service +---------------------- -After downloading the Llama-2-7b-chat model, use the following command in the terminal to deploy the API service: +After downloading the Qwen3-8B model, use the following code in the terminal to deploy the API service: .. code-block:: console - $ python -m lightllm.server.api_server --model_dir ~/models/Llama-2-7b-chat + $ python -m lightllm.server.api_server --model_dir ~/models/Qwen3-8B .. note:: - The ``--model_dir`` parameter in the above command should be changed to the actual path of your model on your machine. - -For the DeepSeek-R1 model on single H200, it can be launched with the following command: - -.. code-block:: console - - $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 8 --graph_max_batch_size 100 - -.. note:: - LOADWORKER specifies the thread for model loading, which can enhance the speed of model loading. The --graph_max_batch_size parameter specifies the number of cudagraphs to be captured, which will capture graphs for batch sizes ranging from 1 to 100. - -For the DeepSeek-R1 model on two H100, it can be launched with the following command: - -.. code-block:: console - - $ # Node 0 - $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 0 - $ # Node 1 - $ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 1 - -3. Start Model Service - Disaggregating Prefill and Decoding ------------------------------------------------------------- - -Find Local IP - -.. code-block:: console - - $ hostname -i - -Run MPS (Optional) - -.. code-block:: console - - $ nvidia-cuda-mps-control -d - -Run pd_master Service - -.. code-block:: console - - $ CUDA_VISIBLE_DEVICES=0 python -m lightllm.server.api_server \ - $ --model_dir /your/model/path \ - $ --run_mode "pd_master" \ - $ --host /your/host/ip \ - $ --port 60011 - -Open a new terminal and run the prefill service - -.. 
code-block:: console - - $ CUDA_VISIBLE_DEVICES=0,1 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \ - $ --run_mode "prefill" \ - $ --host /your/host/ip \ - $ --port 8017 \ - $ --tp 2 \ - $ --nccl_port 2732 \ - $ --max_total_token_num 400000 \ - $ --tokenizer_mode fast \ - $ --pd_master_ip /your/host/ip \ - $ --pd_master_port 60011 \ - $ --max_req_total_len 16000 \ - $ --running_max_req_size 128 \ - $ --disable_cudagraph - -Open a new terminal and run the decoding service - -.. code-block:: console - - $ CUDA_VISIBLE_DEVICES=2,3 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /data/fengdahu/model/Qwen2-7B/ \ - $ --run_mode "decode" \ - $ --host /your/host/ip \ - $ --port 8118 \ - $ --nccl_port 12322 \ - $ --tp 2 \ - $ --max_total_token_num 400000 \ - $ --graph_max_len_in_batch 2048 \ - $ --graph_max_batch_size 16 \ - $ --tokenizer_mode fast \ - $ --pd_master_ip /your/host/ip \ - $ --pd_master_port 60011 - -.. note:: - The tp size for the prefill and decoding stages should remain consistent. - -4. (Optional) Test the Model Service --------------------------------------- - -In a new terminal, use the following command to test the model service: - -.. code-block:: console - - $ curl http://server_ip:server_port/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is AI?", - $ "parameters":{ - $ "max_new_tokens":17, - $ "frequency_penalty":1 - $ } - $ }' - + The ``--model_dir`` parameter in the above code needs to be modified to your actual local model path. -For DeepSeek-R1 benchmark, use the following command to test the model service: +3. Test Model Service +--------------------- .. code-block:: console - $ cd test - $ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.01:8000/generate_stream + $ curl http://127.0.0.1:8000/generate \ + -H "Content-Type: application/json" \ + -d '{ + "inputs": "What is AI?", + "parameters":{ + "max_new_tokens":17, + "frequency_penalty":1 + } + }' diff --git a/docs/EN/source/index.rst b/docs/EN/source/index.rst index 81a0e490d..db4f1d3d1 100755 --- a/docs/EN/source/index.rst +++ b/docs/EN/source/index.rst @@ -1,5 +1,5 @@ -Welcome Lightllm! -================== +Welcome to Lightllm! +==================== .. figure:: ./assets/logos/lightllm-logo.png :width: 100% @@ -10,7 +10,7 @@ Welcome Lightllm! .. raw:: html

- A Light and Fast inference Services for LLM
+ A Lightweight and High-Performance Large Language Model Service Framework

@@ -22,94 +22,52 @@ Welcome Lightllm!

-LightLLM is a Python-based LLM (Large Language Model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance. LightLLM harnesses the strengths of numerous well-regarded open-source implementations, including but not limited to FasterTransformer, TGI, vLLM, and FlashAttention. - -**Features**: - -* Tri-process asynchronous collaboration: tokenization, model inference, and detokenization are performed asynchronously, leading to a considerable improvement in GPU utilization. -* Nopad (Unpad): offers support for nopad attention operations across multiple models to efficiently handle requests with large length disparities. -* Dynamic Batch: enables dynamic batch scheduling of requests -* FlashAttention: incorporates FlashAttention to improve speed and reduce GPU memory footprint during inference. -* Tensor Parallelism: utilizes tensor parallelism over multiple GPUs for faster inference. -* Token Attention: implements token-wise's KV cache memory management mechanism, allowing for zero memory waste during inference. -* High-performance Router: collaborates with Token Attention to meticulously manage the GPU memory of each token, thereby optimizing system throughput. -* Int8KV Cache: This feature will increase the capacity of tokens to almost twice as much. only llama support. - -**Supported Model List**: - -- `BLOOM `_ -- `LLaMA `_ -- `LLaMA V2 `_ -- `StarCoder `_ -- `Qwen-7b `_ -- `ChatGLM2-6b `_ -- `Baichuan-7b `_ -- `Baichuan2-7b `_ -- `Baichuan2-13b `_ -- `Baichuan-13b `_ -- `InternLM-7b `_ -- `Yi-34b `_ -- `Qwen-VL `_ -- `Qwen-VL-Chat `_ -- `Llava-7b `_ -- `Llava-13b `_ -- `Mixtral `_ -- `Stablelm `_ -- `MiniCPM `_ -- `Phi-3 `_ -- `CohereForAI `_ -- `DeepSeek-V2-Lite `_ -- `DeepSeek-V2 `_ - - -Docs List -------------- +Lightllm is a pure Python-based large language model inference and serving framework, featuring lightweight design, easy extensibility, and high performance. +Lightllm integrates the advantages of numerous open-source solutions, including but not limited to FasterTransformer, TGI, vLLM, SGLang, and FlashAttention. -.. toctree:: - :maxdepth: 1 - :caption: Getting started - - getting_started/installation - getting_started/quickstart +**Key Features**: -.. toctree:: - :maxdepth: 1 - :caption: Lightllm +* Multi-process Collaboration: Input text encoding, language model inference, visual model inference, and output decoding are performed asynchronously, significantly improving GPU utilization. +* Cross-process Request Object Sharing: Through shared memory, cross-process request object sharing is achieved, reducing inter-process communication latency. +* Efficient Scheduling Strategy: Peak memory scheduling strategy with prediction, maximizing GPU memory utilization while reducing request eviction. +* High-performance Inference Backend: Efficient operator implementation, support for multiple parallelization methods (tensor parallelism, data parallelism, and expert parallelism), dynamic KV cache, rich quantization support (int8, fp8, int4), structured output, and multi-result prediction. - lightllm/lightllm_intro - lightllm/lightllm_impl +Documentation List +------------------ .. toctree:: :maxdepth: 1 - :caption: Model + :caption: Quick Start - Supported Model - Examples - Add new models + Installation Guide + Quick Start + Performance Benchmark .. 
toctree:: :maxdepth: 1 - :caption: Launching Server - - Server Args - Benchmark - - + :caption: Deployment Tutorials + + DeepSeek R1 Deployment + Multimodal Deployment + Reward Model Deployment + OpenAI api Usage + APIServer Parameters + Lightllm API Introduction + .. toctree:: :maxdepth: 1 - :caption: Using Server + :caption: Model Support - user/api_param - user/openapi_docs - + Supported Models List + Adding New Models .. toctree:: :maxdepth: 1 - :caption: development docs + :caption: Architecture Introduction - dev/token_attention - dev/router - dev/performance + Architecture Overview + Token Attention + Efficient Router .. Indices and tables .. ================== diff --git a/docs/EN/source/lightllm/lightllm_intro.rst b/docs/EN/source/lightllm/lightllm_intro.rst deleted file mode 100644 index a073d5941..000000000 --- a/docs/EN/source/lightllm/lightllm_intro.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. _lightllm: - -LightLLM Overview -=========================== - -With the popularity of ChatGPT, large language model, abbreviated as LLM, has received increasing attention. The emergence of such models has greatly improved people's work efficiency. However, the key to further widespread adoption lies in how to deploy models with billons of parameters at low cost and high throughput. To improve the throughput of large model services and enable more interested researchers to quickly get involved, a lightweight LLM inference service framework called LightLLM has emerged. LightLLM introduces a more fine-grained kv cache management algorithm called TokenAttention and designs an Efficient Router scheduling implementation that works efficiently with TokenAttention. Through the interaction of TokenAttention and Efficient Router, LightLLM achieves higher throughput than vLLM and Text Generation Inference in most scenarios, with performance improvements of around 4 times in some cases. LightLLM is flexible, user-friendly, and efficient. Interested friends may want to click on the link below to try it out. - -Project:https://github.com/ModelTC/lightllm - -.. _challenge: - -The challenge of LLM Serving -------------------------------- - -Large language models have garnered significant attention from researchers due to their excellent performance. These models not only engage in everyday conversations with humans but also assist in completing various daily tasks, thereby enhancing productivity. However, despite the remarkable performance demonstrated by these models, deploying large-scale models to improve service performance poses the following challenges: - -* **Severe fragmentation of memory**: Network weights ranging from tens to hundreds of gigabytes, as well as the constantly dynamic growing KV Cache during inference, easily leads to low memory utilization. -* **Low efficiency in request scheduling**: The length of requests dynamically changes over time, which can result in GPU idling or low utilization issues. -* **High difficulty in kernel customization**: Customizing kernels for networks is necessary to efficiently utilize memory and improve service throughput. However, it will require a significant amount of effort from researchers. - -.. _solutions_and_problems: - -Existing solutions and problems -------------------------------------- - -To address the aforementioned challenges, many excellent LLM inference frameworks have emerged, such as FasterTransformer, Text-Generation-Inference (referred to as TGI), vLLM, etc. 
The core features and capability matrices of these frameworks are shown in the table below: - -.. list-table:: Comparison of various frameworks - :header-rows: 1 - - * - Framework - - NV Triton + FasterTransformer - - TGI - - vLLM - - LightLLM - * - core feature - - Efficient kernel - - `Continuous batch `_, Token streaming - - `PageAttention `_ - - Tri-process asynchronous collaboration,:ref:`TokenAttention`,:ref:`Efficient_Router` - * - Memory fragmentation - - low - - high - - low - - low - * - Request scheduling efficiency - - low - - middle - - middle - - high - * - Difficulty of kernel customization - - high - - middle - - middle - - low - -These frameworks all have their own unique features. For example, FasterTransformer has excellent static inference performance but lacks robust service scheduling and is primarily developed in C++, resulting in high secondary development costs. TGI has excellent service interfaces and scheduling features such as Continuous Batch, but its inference performance, scheduling strategy, and memory management have some shortcomings. vLLM has excellent memory management but lacks efficiency in request scheduling, and its overall implementation details are more suitable for deploying small models. - -Lightllm ----------------------- - -Therefore, to address these issues, we have developed a LLM deployment framework called LightLLM, which is based on the pure Python language. It enables researchers to easily deploy and customize lightweight models locally, allowing for rapid expansion of different models and integration of various excellent open-source features. The core features of LightLLM are as follows: - -* Tri-process asynchronous collaboration: tokenization, model inference, and detokenization are performed asynchronously, leading to a considerable improvement in GPU utilization. -* :ref:`TokenAttention`: implements token-wise's KV cache memory management mechanism, allowing for zero memory waste during inference. -* :ref:`Efficient_Router`: collaborates with Token Attention to meticulously manage the GPU memory of each token, thereby optimizing system throughput. - -With the highly coordinated efficient kernels developed based on OpenAI Triton and service scheduling, LightLLM achieves excellent throughput performance - -.. figure:: ../assets/lightllm/arch.png - :width: 100% - :align: center - :alt: Lightllm - :class: no-scaled-link - - - -LightLLM is committed to enabling more people to participate, allowing flexible and efficient exploration of various LLM deployment and inference solutions. It also serves as a reference for hardware manufacturers to promote the development of the field. We hope that everyone can give it more stars, fork the project, and contribute. We believe that in the future, more technologies and solutions (such as TensorRT) will emerge, continuously reducing deployment costs and making AGI more accessible to ordinary households. \ No newline at end of file diff --git a/docs/EN/source/models/add_new_model.md b/docs/EN/source/models/add_new_model.md index cc819864f..6127dffaf 100755 --- a/docs/EN/source/models/add_new_model.md +++ b/docs/EN/source/models/add_new_model.md @@ -1,25 +1,25 @@ # How to Add New Model Support -## 1. Introduction of inference architecture +## 1. Current Inference Architecture Introduction -In the lightllm/common/basemodel directory, you will find the base class implementation for the entire inference architecture. 
+Under the ***lightllm/common/basemodel*** directory is the base class implementation of the entire inference architecture ~~~shell -├── basemodel.py # Model architecture class -├── infer_struct.py # State class for inference +├── basemodel.py # Model framework class +├── infer_struct.py # Inference state class ├── __init__.py -├── layer_infer # Inference layer base class +├── layer_infer # Base class implementation of inference layers │ ├── base_layer_infer.py │ ├── __init__.py │ ├── post_layer_infer.py │ ├── pre_layer_infer.py -│ ├── template # Template implementation of the inference layer. +│ ├── template # Template implementation of inference layers, inheriting from templates can reduce development effort and duplicate code │ │ ├── __init__.py │ │ ├── post_layer_infer_template.py │ │ ├── pre_layer_infer_template.py │ │ └── transformer_layer_infer_template.py │ └── transformer_layer_infer.py -├── layer_weights # base class of weight +├── layer_weights # Weight base class implementation │ ├── base_layer_weight.py │ ├── hf_load_utils.py │ ├── __init__.py @@ -31,41 +31,41 @@ In the lightllm/common/basemodel directory, you will find the base class impleme └── __init__.py ~~~ -As shown above, the current model inference architecture mainly consists of two parts: weight and inference. +As shown above, the current model inference architecture mainly consists of two parts: weights and inference. -### Weight +### Weights -The layer_weights directory contains weight-related codes. In theory, a newly added model needs to inherit the PreAndPostLayerWeight and TransformerLayerWeight classes in pre_and_post_layer_weight.py and transformer_layer_weight.py to load weights. +Under the layer_weights directory is the weight-related code. Theoretically, for a newly added model, you need to inherit and implement the PreAndPostLayerWeight and TransformerLayerWeight classes in pre_and_post_layer_weight.py and transformer_layer_weight.py to implement weight loading. -| Weight base class | Responsibilities | -| ---------------------- | ------------------------------------------------------------ | -| PreAndPostLayerWeight | Responsible for loading the weights of the first Embedding layer and the last post-processing layer of the LLM model and splitting the weights according to the tp parameters used | -| TransformerLayerWeight | Responsible for loading the weights of the LLM model transformer layer and splitting the weights according to the tp parameters used | +| Weight Base Class | Responsibilities | +| ---------------------------- | ------------------------------------------------------------ | +| PreAndPostLayerWeight | Responsible for loading weights of the first Embedding layer and the last post-processing layer of LLM models, and splitting weights according to the tp parameter used | +| TransformerLayerWeight | Responsible for loading weights of transformer layers of LLM models and splitting weights according to the tp parameter used | ### Inference -The layer_infer directory contains the base classes for inference processing, and some templates are provided in the template directory. Inheriting from the template class can reduce some unnecessary duplication of code and simplify the implementation. There are three inference classes that need to be inherited in this directory. +Under the layer_infer directory are the relevant base classes for inference processing, and some templates are provided under the template directory. 
Inheriting from template classes can reduce some unnecessary duplicate code and simplify implementation. There are three inference classes that need to be inherited and implemented under this directory. -| Inference base class | Responsibilities | -| --------------------- | ------------------------------------------ | -| PreLayerInfer | Responsible for inference of the Embedding layer | -| TransformerLayerInfer | Responsible for inference of th transformer layer | -| PostLayerInfer | Responsible for inference of converting the final hidden layer output of the network into logits | +| Inference Base Class | Responsibilities | +| ---------------------------- | -------------------------------------------- | +| PreLayerInfer | Responsible for inference of Embedding layer | +| TransformerLayerInfer | Responsible for inference of transformer layer | +| PostLayerInfer | Responsible for converting the final hidden layer output of the network to logits inference | -The base class BaseLayerInfer of the above three classes provides two most important external service function interfaces. All inference behaviors will be entered through these two interfaces. +The base class BaseLayerInfer of the above three classes provides two most important external service function interfaces. All inference behaviors will enter through these two interfaces. -| interface | Responsibilities | -| ------------------------------------------------------------ | ---------------------------------------------- | -| def context_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | the first inference of batch(prefill) | -| def token_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | the inference of decode | +| Interface | Responsibilities | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| def context_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | First inference of batch (also called prefill in code) | +| def token_forward(self, input_ids, infer_state: InferStateInfo, layer_weight: BaseLayerWeight): | Single step decode stage inference | -### Operator +### Operators -The triton_kernel directory contains some operators needed for inference implemented using openai triton. +Under the triton_kernel directory are some operators needed for inference implemented using OpenAI triton. -### State class +### State Class -The InferStateInfo class in infer_struct.py is a state class that passes some important information between layers when performing a model inference. Different models can inherit and implement this class to add unique state information that each model needs to pass. The InferStateInfo class provides an inherited init_some_extra_state interface for initializing the transmission of additional unique information. +The InferStateInfo class in infer_struct.py is a state class that passes some important information between layers during a model inference. Different models can inherit and implement this class to add unique state information that each model needs to pass. The InferStateInfo class provides an inheritable init_some_extra_state interface for initializing additional unique information. 
~~~python def init_some_extra_state(self, @@ -81,9 +81,9 @@ The InferStateInfo class in infer_struct.py is a state class that passes some im pass ~~~ -### Model class +### Model Framework Class -The TpPartBaseModel class in basemodel.py is the entry point of the entire model. Each type of model needs to inherit and implement this class. This class uses the inference class, weight class, and state class to complete the model loading and inference functions in a similar way to building blocks. Many of its interfaces can be inherited and implemented to complete the unique operations of each model type. +The TpPartBaseModel class in basemodel.py is the entry point of the entire model. Each type of model needs to inherit and implement this class. This class uses inference classes, weight classes, and state classes in a building block-like manner to complete model loading and inference functions. There are many interfaces that can be inherited and implemented to complete unique operations for each model type. ~~~python class TpPartBaseModel: @@ -99,9 +99,7 @@ class TpPartBaseModel: # infer state class infer_state_class = InferStateInfo - def __init__(self, tp_rank, world_size, weight_dir, max_total_token_num, load_way="HF", mode=[]): - self.tp_rank_ = tp_rank - self.tp_world_size_ = world_size + def __init__(self, weight_dir, max_total_token_num, load_way="HF", mode=[]): self.weight_dir_ = weight_dir self.max_total_token_num = max_total_token_num self.load_way = load_way @@ -120,21 +118,21 @@ class TpPartBaseModel: ... ~~~ -Common interfaces that need to be inherited and implemented +Commonly used interfaces that need to be inherited and implemented -| interfaces | effect | +| Interface | Function | | ---------------------------- | ------------------------------------------------------------ | -| def _init_config(self): | Read the config.json of the initialization model and perform some key name legalization operations | -| def _verify_params(self): | Verification parameters | -| def _init_mem_manager(self): | Initialize the mem manager object used by token attention | -| def _init_some_value(self): | Initialize the values ​​of some member variables used by the inference framework | -| def _init_custom(self): | Some models have their own personalized initialization, such as llama initializing its own Rotary value | +| def _init_config(self): | Read the config.json for initializing the model and perform some key name legalization operations | +| def _verify_params(self): | Validate parameters | +| def _init_mem_manager(self): | Initialize the mem manager object used by token attention | +| def _init_some_value(self): | Initialize values of some member variables that the inference framework will use | +| def _init_custom(self): | Some personalized initialization of the model itself, such as llama initializing its own Rotary values | -## 2. the example of adding bloom model +## 2. Example of Adding Bloom Model -The specific implementation is in the ***lightllm/models/bloom*** directory. Please read the corresponding source code for the following code snippets. The triton_kernel directory contains some kernels used by the inference class, which will not be introduced in detail below. At the same time, the bloom model uses the default state class because it does not need to pass special state information. If you want to understand the entire framework more deeply, you can further refer to the access implementation source code of models such as llama and llama2. 
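To make the "building blocks" wiring above concrete, here is a hypothetical skeleton of a new model class. The import path and the commented-out `*_class` attribute names are assumptions inferred from the directory layout and the elided part of the `TpPartBaseModel` snippet (only `infer_state_class` is shown there explicitly); the bloom implementation described next is the authoritative pattern.

~~~python
# Hypothetical skeleton for a new model; names are assumptions as noted in the
# text above. Check lightllm/models/bloom for the real wiring.
from lightllm.common.basemodel import TpPartBaseModel, InferStateInfo


class MyNewModel(TpPartBaseModel):
    # Plug in the model's own weight / inference classes here, e.g.:
    # pre_and_post_weight_class = MyPreAndPostLayerWeight
    # transformer_weight_class = MyTransformerLayerWeight
    # pre_layer_infer_class = MyPreLayerInfer
    # transformer_layer_infer_class = MyTransformerLayerInfer
    # post_layer_infer_class = MyPostLayerInfer
    infer_state_class = InferStateInfo  # the default state class is often enough

    def _init_config(self):
        # read config.json and normalize key names for this model family
        super()._init_config()

    def _init_custom(self):
        # model-specific setup, e.g. precomputing rotary embedding tables
        pass
~~~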
+The specific implementation is under the ***lightllm/models/bloom*** directory. Please read the source code for the code snippets below. The triton_kernel directory contains some kernels used by inference classes, which will not be introduced in detail in this article. At the same time, the bloom model uses the default state class because it doesn't need to pass special state information. For a deeper understanding of the entire framework, you can further refer to the source code implementation of llama and llama2 model integration. -### (1) Add implementation weight class +### (1) Add Implementation Weight Classes ***pre_and_post_layer_weight.py*** diff --git a/docs/EN/source/models/supported_models.rst b/docs/EN/source/models/supported_models.rst index 761348329..bc1945666 100755 --- a/docs/EN/source/models/supported_models.rst +++ b/docs/EN/source/models/supported_models.rst @@ -1,24 +1,23 @@ -Supported Models -================ +Supported Models List +===================== -lightllm supports most mainstream open source large language models and multimodal models, and will continue to expand the list of supported models. In later versions, lightllm will support more types of models (such as reward models). +Lightllm supports most mainstream open-source large language models and multimodal models, and will continue to expand the list of supported models. In future versions, lightllm will support more types of models (such as reward models). .. note:: - Due to its lightweight design, Lightllm is highly extensible, which means that adding new model support is very simple. For more information, please refer to the **How to Add New Model Support** section. + Due to its lightweight design, Lightllm is highly extensible, which means that adding new model support is very simple. For more information, please refer to the **Adding New Models** section. ----- -LLM -^^^^^^^^^^^^^^^^^^^^^^ - +Large Language Models +^^^^^^^^^^^^^^^^^^^^^ .. list-table:: :widths: 25 25 :header-rows: 1 - * - model - - note + * - Model + - Notes * - `BLOOM `_ - * - `LLaMA `_ @@ -42,29 +41,29 @@ LLM * - `MiniCPM `_ - * - `Phi-3 `_ - - only supports Mini and Small. + - Only supports Mini and Small. * - `CohereForAI `_ - :code:`--data_type bfloat16` * - `DeepSeek-V2-Lite `_ - :code:`--data_type bfloat16` * - `DeepSeek-V2 `_ - :code:`--data_type bfloat16` + * - `DeepSeek-V3 `_ + - * - `Qwen3 `_ - * - `Qwen3-Moe `_ - - - -VLM +Multimodal Models ^^^^^^^^^^^^^^^^^ .. list-table:: :widths: 25 25 :header-rows: 1 - * - model - - note + * - Model + - Notes * - `Qwen-VL `_ - :code:`--trust_remote_code --enable_multimodal` * - `Qwen-VL-Chat `_ @@ -73,19 +72,22 @@ VLM - :code:`--enable_multimodal` * - `Llava-13b `_ - :code:`--enable_multimodal` + * - `Qwen2-VL `_ + - :code:`--enable_multimodal` * - `Google Gemma3 `_ - :code:`--enable_multimodal` - -Reward Model -^^^^^^^^^^^^^^^^^ +Reward Models +^^^^^^^^^^^^^ .. list-table:: :widths: 25 25 :header-rows: 1 - * - model - - note + * - Model + - Notes * - `internLM-reward `_ - :code:`--use_reward_model` + * - `Qwen2-Reward `_ + - :code:`--use_reward_model` diff --git a/docs/EN/source/models/test.rst b/docs/EN/source/models/test.rst deleted file mode 100755 index b599bc72f..000000000 --- a/docs/EN/source/models/test.rst +++ /dev/null @@ -1,273 +0,0 @@ -Examples -================ - -LLaMA -^^^^^^^^^^^^^^^^^^^^^ - -**Launching Server** - -.. 
code-block:: console - - $ python -m lightllm.server.api_server --model_dir /path/llama-7B \ - $ --host 0.0.0.0 \ - $ --port 8080 \ - $ --tp 1 \ - $ --max_total_token_num 120000 - -.. tip:: - - The parameter `max_total_token_num` is influenced by the GPU memory of the deployment environment. You can also specify `--mem_faction` to have it calculated automatically. - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir /path/llama-7B \ - $ --host 0.0.0.0 \ - $ --port 8080 \ - $ --tp 1 \ - $ --mem_faction 0.9 - -**Test Server** - -.. code-block:: console - - $ curl http://127.0.0.1:8080/generate \ - $ -X POST \ - $ -d '{"inputs":"What is AI?","parameters":{"max_new_tokens":17, "frequency_penalty":1}}' \ - $ -H 'Content-Type: application/json' - -.. code-block:: python - - import time - import requests - import json - - url = 'http://localhost:8080/generate' - headers = {'Content-Type': 'application/json'} - data = { - 'inputs': 'What is AI?', - "parameters": { - 'do_sample': False, - 'ignore_eos': False, - 'max_new_tokens': 1024, - } - } - response = requests.post(url, headers=headers, data=json.dumps(data)) - if response.status_code == 200: - print(response.json()) - else: - print('Error:', response.status_code, response.text) - -Qwen2-0.5B -^^^^^^^^^^^^^^^^^^^^^ - -**Launching Server** - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir ~/models/Qwen2-0.5B \ - $ --trust_remote_code - -**Test Server** - -.. code-block:: console - - $ curl http://localhost:8000/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is AI?", - $ "parameters":{ - $ "max_new_tokens":17, - $ "frequency_penalty":1 - $ } - $ }' - - -Qwen-VL-Chat -^^^^^^^^^^^^^^^^^ - -**Launching Server** - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir ~/models/Qwen-VL-Chat \ - $ --trust_remote_code \ - $ --enable_multimodal - -**Test Server** - -.. code-block:: python - - import json - import requests - import base64 - - def run(query, uris): - images = [] - for uri in uris: - if uri.startswith("http"): - images.append({"type": "url", "data": uri}) - else: - with open(uri, 'rb') as fin: - b64 = base64.b64encode(fin.read()).decode("utf-8") - images.append({'type': "base64", "data": b64}) - - data = { - "inputs": query, - "parameters": { - "max_new_tokens": 200, - # The space before <|endoftext|> is important, - # the server will remove the first bos_token_id, - # but QWen tokenizer does not has bos_token_id - "stop_sequences": [" <|endoftext|>", " <|im_start|>", " <|im_end|>"], - }, - "multimodal_params": { - "images": images, - } - } - - url = "http://127.0.0.1:8000/generate" - headers = {'Content-Type': 'application/json'} - response = requests.post(url, headers=headers, data=json.dumps(data)) - return response - - query = """ - <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - - what is this?<|im_end|> - <|im_start|>assistant - """ - - response = run( - uris = [ - "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - ], - query = query - ) - - if response.status_code == 200: - print(f"Result: {response.json()}") - else: - print(f"Error: {response.status_code}, {response.text}") - -Llava -^^^^^^^^^^^^^^^^^ - -**Launching Server** - -.. 
code-block:: console - - $ python -m lightllm.server.api_server \ - $ --host 0.0.0.0 \ - $ --port 8080 \ - $ --tp 1 \ - $ --max_total_token_num 12000 \ - $ --trust_remote_code \ - $ --enable_multimodal \ - $ --cache_capacity 1000 \ - $ --model_dir /path/of/llava-v1.5-7b or /path/of/llava-v1.5-13b - -**Test Server** - -.. code-block:: python - - import time - import requests - import json - import base64 - - url = 'http://localhost:8080/generate' - headers = {'Content-Type': 'application/json'} - - uri = "/local/path/of/image" # or "/http/path/of/image" - if uri.startswith("http"): - images = [{"type": "url", "data": uri}] - else: - with open(uri, 'rb') as fin: - b64 = base64.b64encode(fin.read()).decode("utf-8") - images=[{'type': "base64", "data": b64}] - - data = { - "inputs": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nPlease explain the picture. ASSISTANT:", - "parameters": { - "max_new_tokens": 200, - }, - "multimodal_params": { - "images": images, - } - } - - response = requests.post(url, headers=headers, data=json.dumps(data)) - if response.status_code == 200: - print(response.json()) - else: - print('Error:', response.status_code, response.text) - - -internlm2-1_8b -^^^^^^^^^^^^^^^^^^^^^^^ - -**Launching Server** - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir ~/models/internlm2-1_8b \ - $ --trust_remote_code - - -**Test Server** - -.. code-block:: console - - $ curl http://localhost:8000/generate \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "inputs": "What is LLM?", - $ "parameters":{ - $ "max_new_tokens":170, - $ "frequency_penalty":1 - $ } - $ }' - - -internlm2-1_8b-reward -^^^^^^^^^^^^^^^^^^^^^^^ - -**Launching Server** - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir ~/models/internlm2-1_8b-reward \ - $ --use_reward_model \ - $ --trust_remote_code - -.. tip:: - - ``--use_reward_model`` Indicates options that must be turned on to use the reward model. - - -**Test Server** - -.. code-block:: python - - import json - import requests - - query = "<|im_start|>user\nHello! What's your name?<|im_end|>\n<|im_start|>assistant\nMy name is InternLM2! A helpful AI assistant. What can I do for you?<|im_end|>\n<|reward|>" - - url = "http://127.0.0.1:8000/get_score" - headers = {'Content-Type': 'application/json'} - - data = { - "chat": query, - "parameters": { - "frequency_penalty":1 - } - } - response = requests.post(url, headers=headers, data=json.dumps(data)) - - if response.status_code == 200: - print(f"Result: {response.json()}") - else: - print(f"Error: {response.status_code}, {response.text}") \ No newline at end of file diff --git a/docs/EN/source/server/api_server_args.rst b/docs/EN/source/server/api_server_args.rst deleted file mode 100755 index 98c1c61bb..000000000 --- a/docs/EN/source/server/api_server_args.rst +++ /dev/null @@ -1,12 +0,0 @@ -APIServer Args -============================= - - -Usage -++++++++++++ - -.. 
argparse:: - :module: lightllm.server.api_cli - :func: make_argument_parser - :prog: python -m lightllm.server.api_server - :nodefaultconst: \ No newline at end of file diff --git a/docs/EN/source/server/benchmark.rst b/docs/EN/source/server/benchmark.rst deleted file mode 100755 index 8487da111..000000000 --- a/docs/EN/source/server/benchmark.rst +++ /dev/null @@ -1,43 +0,0 @@ -Benchmark -================== - -After deploying the model, it is very important to evaluate the service performance. By adjusting the configuration based on the service performance, the graphics card resources can be better utilized. -In this article, we use the LLaMA-7B model to compare the performance of lightllm and vLLM==0.1.2 on an 80G A800 graphics card. -For the specific comparison method, please refer to the following steps: - -1. Download datasets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: console - - $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - -2. Launching Server -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: console - - $ python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto - - -3. Benchmark -^^^^^^^^^^^^^^^^ - -.. code-block:: console - - $ cd test - $ python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 - - -output: - -.. code-block:: console - - read data set finish - total tokens: 494250 - Total time: 111.37 s - Throughput: 8.98 requests/s - Average latency: 43.52 s - Average latency per token: 0.15 s - Average latency per output token: 0.73 s \ No newline at end of file diff --git a/docs/EN/source/user/api_param.rst b/docs/EN/source/tutorial/api_param.rst old mode 100755 new mode 100644 similarity index 76% rename from docs/EN/source/user/api_param.rst rename to docs/EN/source/tutorial/api_param.rst index 96577ede2..89474e617 --- a/docs/EN/source/user/api_param.rst +++ b/docs/EN/source/tutorial/api_param.rst @@ -1,56 +1,50 @@ -API parameter -========================== - +API Call Details +================ :code:`GET /health` -~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ :code:`HEAD /health` -~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ :code:`GET /healthz` -~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ Get the current server running status -**Usage Examples**: +**Call Example**: .. code-block:: console $ curl http://0.0.0.0:8080/health - -**Output Examples**: +**Output Example**: .. code-block:: python {"message":"Ok"} - - :code:`GET /token_load` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~ -Get the current server token usage +Get the current server token usage status -**Usage Examples**: +**Call Example**: .. code-block:: console $ curl http://0.0.0.0:8080/token_load - -**Output Examples**: +**Output Example**: .. code-block:: python {"current_load":0.0,"logical_max_load":0.0,"dynamic_max_load":0.0} - :code:`POST /generate` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~ -Calling the model to implement text completion +Call the model to implement text completion -**Usage Examples**: +**Call Example**: .. code-block:: console @@ -65,21 +59,18 @@ Calling the model to implement text completion $ "multimodal_params":{} $ }' - -**Output Examples**: +**Output Example**: .. code-block:: python {"generated_text": [" What is the difference between AI and ML? 
What are the differences between AI and ML"], "count_output_tokens": 17, "finish_reason": "length", "prompt_tokens": 4} - :code:`POST /generate_stream` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Streaming returns text completion results +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Stream return text completion results -**Usage Examples**: +**Call Example**: .. code-block:: console @@ -94,7 +85,7 @@ Streaming returns text completion results $ "multimodal_params":{} $ }' -**Output Examples**: +**Output Example**: :: @@ -104,12 +95,11 @@ Streaming returns text completion results data:{"token": {"id": 279, "text": " the", "logprob": -1.5594439506530762, "special": false, "count_output_tokens": 3, "prompt_tokens": 4}, "generated_text": null, "finished": true, "finish_reason": "length", "details": null} - :code:`POST /get_score` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Reward model, get the dialogue score. +~~~~~~~~~~~~~~~~~~~~~~~ +Reward model, get conversation score -**Usage Examples**: +**Call Example**: .. code-block:: python @@ -134,14 +124,8 @@ Reward model, get the dialogue score. else: print(f"Error: {response.status_code}, {response.text}") -**Output Examples**: +**Output Example**: :: - Result: {'score': 0.4892578125, 'prompt_tokens': 39, 'finish_reason': 'stop'} - - -:code:`POST /v1/chat/completions` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -openai type api, see `openai API docs `_ for details. + Result: {'score': 0.4892578125, 'prompt_tokens': 39, 'finish_reason': 'stop'} \ No newline at end of file diff --git a/docs/EN/source/tutorial/api_server_args_zh.rst b/docs/EN/source/tutorial/api_server_args_zh.rst new file mode 100644 index 000000000..a409777e8 --- /dev/null +++ b/docs/EN/source/tutorial/api_server_args_zh.rst @@ -0,0 +1,199 @@ +APIServer Parameter Details +========================== + +This document provides detailed information about all startup parameters and their usage for LightLLM APIServer. + +Basic Configuration Parameters +----------------------------- + +.. option:: --run_mode + + Set the running mode, optional values: + + * ``normal``: Single server mode (default) + * ``prefill``: Prefill mode (for pd separation running mode) + * ``decode``: Decode mode (for pd separation running mode) + * ``pd_master``: pd master node mode (for pd separation running mode) + * ``config_server``: Configuration server mode (for pd separation mode, used to register pd_master nodes and get pd_master node list), specifically designed for large-scale, high-concurrency scenarios, used when `pd_master` encounters significant CPU bottlenecks. + +.. option:: --host + + Server listening address, default is ``127.0.0.1`` + +.. option:: --port + + Server listening port, default is ``8000`` + +.. option:: --httpserver_workers + + HTTP server worker process count, default is ``1`` + +.. option:: --zmq_mode + + ZMQ communication mode, optional values: + + * ``tcp://``: TCP mode + * ``ipc:///tmp/``: IPC mode (default) + + Can only choose from ``['tcp://', 'ipc:///tmp/']`` + +PD Separation Mode Parameters +---------------------------- + +.. option:: --pd_master_ip + + PD master node IP address, default is ``0.0.0.0`` + + This parameter needs to be set when run_mode is set to prefill or decode + +.. option:: --pd_master_port + + PD master node port, default is ``1212`` + + This parameter needs to be set when run_mode is set to prefill or decode + +.. option:: --pd_decode_rpyc_port + + Port used by decode nodes for kv move manager rpyc server in PD mode, default is ``42000`` + +.. 
option:: --config_server_host + + Host address in configuration server mode + +.. option:: --config_server_port + + Port number in configuration server mode + +Model Configuration Parameters +----------------------------- + +.. option:: --model_name + + Model name, used to distinguish internal model names, default is ``default_model_name`` + + Can be obtained via ``host:port/get_model_name`` + +.. option:: --model_dir + + Model weight directory path, the application will load configuration, weights, and tokenizer from this directory + +.. option:: --tokenizer_mode + + Tokenizer loading mode, optional values: + + * ``slow``: Slow mode, loads fast but runs slow, suitable for debugging and testing + * ``fast``: Fast mode (default), achieves best performance + * ``auto``: Auto mode, tries to use fast mode, falls back to slow mode if it fails + +.. option:: --load_way + + Model weight loading method, default is ``HF`` (Huggingface format) + + Llama models also support ``DS`` (Deepspeed) format + +.. option:: --trust_remote_code + + Whether to allow using custom model definition files on Hub + +Memory and Batch Processing Parameters +------------------------------------ + +.. option:: --max_total_token_num + + Total token count supported by GPU and model, equals max_batch * (input_len + output_len) + + If not specified, will be automatically calculated based on mem_fraction + +.. option:: --mem_fraction + + Memory usage ratio, default is ``0.9`` + + If OOM occurs during runtime, you can specify a smaller value + +.. option:: --batch_max_tokens + + Maximum token count for new batches, controls prefill batch size to prevent OOM + +.. option:: --running_max_req_size + + Maximum number of requests for simultaneous forward inference, default is ``1000`` + +.. option:: --max_req_total_len + + Maximum value of request input length + request output length, default is ``16384`` + +.. option:: --eos_id + + End stop token ID, can specify multiple values. If None, will be loaded from config.json + +.. option:: --tool_call_parser + + OpenAI interface tool call parser type, optional values: + + * ``qwen25`` + * ``llama3`` + * ``mistral`` + +Different Parallel Mode Setting Parameters +---------------------------------------- + +.. option:: --nnodes + + Number of nodes, default is ``1`` + +.. option:: --node_rank + + Current node rank, default is ``0`` + +.. option:: --multinode_httpmanager_port + + Multi-node HTTP manager port, default is ``12345`` + +.. option:: --multinode_router_gloo_port + + Multi-node router gloo port, default is ``20001`` + +.. option:: --tp + + Model tensor parallelism size, default is ``1`` + +.. option:: --dp + + Data parallelism size, default is ``1`` + + This is a useful parameter for deepseekv2. When using deepseekv2 model, set dp equal to the tp parameter. + In other cases, please do not set it, keep the default value of 1. + +.. option:: --nccl_host + + nccl_host used to build PyTorch distributed environment, default is ``127.0.0.1`` + + For multi-node deployment, should be set to the master node's IP + +.. option:: --nccl_port + + nccl_port used to build PyTorch distributed environment, default is ``28765`` + +.. 
option:: --use_config_server_to_init_nccl + + Use tcp store server started by config_server to initialize nccl, default is False + + When set to True, --nccl_host must equal config_server_host, --nccl_port must be unique for config_server, + do not use the same nccl_port for different inference nodes, this will be a serious error + +Attention Type Selection Parameters +--------------------------------- + +.. option:: --mode + + Model inference mode, can specify multiple values: + + * ``triton_int8kv``: Use int8 to store kv cache, can increase token capacity, uses triton kernel + * ``ppl_int8kv``: Use int8 to store kv cache, uses ppl fast kernel + * ``ppl_fp16``: Use ppl fast fp16 decode attention kernel + * ``triton_flashdecoding``: Flashdecoding mode for long context, currently supports llama llama2 qwen + * ``triton_gqa_attention``: Fast kernel for models using GQA + * ``triton_gqa_flashdecoding``: Fast flashdecoding kernel for models using GQA + * ``triton_fp8kv``: Use float8 to store kv cache, currently only used for deepseek2 + + Need to read source code to confirm specific modes supported by all models \ No newline at end of file diff --git a/docs/EN/source/tutorial/deepseek_deployment.rst b/docs/EN/source/tutorial/deepseek_deployment.rst new file mode 100644 index 000000000..35f54ea1a --- /dev/null +++ b/docs/EN/source/tutorial/deepseek_deployment.rst @@ -0,0 +1,200 @@ +.. _deepseek_deployment: + +DeepSeek Model Deployment Guide +=============================== + +LightLLM supports various deployment solutions for DeepSeek models, including DeepSeek-R1, DeepSeek-V2, DeepSeek-V3, etc. This document provides detailed information on various deployment modes and configuration solutions. + +Deployment Mode Overview +----------------------- + +LightLLM supports the following deployment modes: + +1. **Single Machine TP Mode**: Deploy using tensor parallelism on a single machine +2. **Single Machine EP Mode**: Deploy using expert parallelism on a single machine +3. **Multi-Machine TP Mode**: Use tensor parallelism across multiple machines +4. **Multi-Machine EP Mode**: Use expert parallelism across multiple machines +5. **PD Separation Mode**: Separate prefill and decode deployment +6. **Multi PD Master Mode**: Support multiple PD Master nodes + +1. Single Machine Deployment Solutions +------------------------------------- + +1.1 Single Machine TP Mode (Tensor Parallel) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suitable for deploying DeepSeek-R1 model on a single H200 machine. + +**Launch Command:** + +.. code-block:: bash + + # H200 Single Machine DeepSeek-R1 TP Mode + LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 8 \ + --enable_fa3 + +**Parameter Description:** +- `LOADWORKER=18`: Model loading thread count, improves loading speed +- `--tp 8`: Tensor parallelism degree, using 8 GPUs +- `--enable_fa3`: Enable Flash Attention 3.0 +- `--port 8088`: Service port + +1.2 Single Machine DP + EP Mode (Data Parallel + Expert Parallel) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suitable for expert parallelism deployment of MoE models like DeepSeek-V2/V3. + +**Launch Command:** + +.. 
code-block:: bash + + # H200 Single Machine DeepSeek-R1 DP + EP Mode + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 8 \ + --dp 8 \ + --enable_fa3 + +**Parameter Description:** +- `MOE_MODE=EP`: Set expert parallelism mode +- `--tp 8`: Tensor parallelism degree +- `--dp 8`: Data parallelism degree, usually set to the same value as tp +- `--enable_fa3`: Enable Flash Attention 3.0 + +**Optional Optimization Parameters:** +- `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap +- `--enable_decode_microbatch_overlap`: Enable decode microbatch overlap + +2. Multi-Machine Deployment Solutions +------------------------------------ + +2.1 Multi-Machine TP Mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suitable for deployment across multiple H200/H100 machines. + +**Node 0 Launch Command:** + +.. code-block:: bash + + # H200/H100 Multi-Machine DeepSeek-R1 TP Mode Node 0 + # Usage: sh multi_node_tp_node0.sh + export nccl_host=$1 + LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 0 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**Node 1 Launch Command:** + +.. code-block:: bash + + # H200/H100 Multi-Machine DeepSeek-R1 TP Mode Node 1 + # Usage: sh multi_node_tp_node1.sh + export nccl_host=$1 + LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 1 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**Parameter Description:** +- `--nnodes 2`: Total number of nodes +- `--node_rank 0/1`: Current node rank +- `--nccl_host`: NCCL communication host address +- `--nccl_port 2732`: NCCL communication port + +2.2 Multi-Machine EP Mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suitable for deploying MoE models across multiple machines. + +**Node 0 Launch Command:** + +.. code-block:: bash + + # H200 Multi-Machine DeepSeek-R1 EP Mode Node 0 + # Usage: sh multi_node_ep_node0.sh + export nccl_host=$1 + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --dp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 0 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**Node 1 Launch Command:** + +.. code-block:: bash + + # H200 Multi-Machine DeepSeek-R1 EP Mode Node 1 + # Usage: sh multi_node_ep_node1.sh + export nccl_host=$1 + MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ + --model_dir /path/DeepSeek-R1 \ + --tp 16 \ + --dp 16 \ + --enable_fa3 \ + --nnodes 2 \ + --node_rank 1 \ + --nccl_host $nccl_host \ + --nccl_port 2732 + +**Optional Optimization Parameters:** +- `--enable_prefill_microbatch_overlap`: Enable prefill microbatch overlap +- `--enable_decode_microbatch_overlap`: Enable decode microbatch overlap + +3. PD Separation Deployment Solutions +------------------------------------ + +PD (Prefill-Decode) separation mode separates prefill and decode stages for deployment, which can better utilize hardware resources. + +3.1 Single PD Master Mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Step 1: Launch PD Master Service** + +.. code-block:: bash + + # PD Master for DeepSeek-R1 + # Usage: sh pd_master.sh + export pd_master_ip=$1 + python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 \ + --run_mode "pd_master" \ + --host $pd_master_ip \ + --port 60011 + +**Step 2: Launch Prefill Service** + +.. 
code-block:: bash + + # PD prefill mode for DeepSeek-R1 (DP+EP) on H200 + # Usage: sh pd_prefill.sh + # nvidia-cuda-mps-control -d, run MPS (optional, performance will be much better with mps support, but some graphics cards and driver environments may encounter errors when enabling mps, it's recommended to upgrade to a higher driver version, especially for H-series cards) + + export host=$1 + export pd_master_ip=$2 + nvidia-cuda-mps-control -d + MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \ + --model_dir /path/DeepSeek-R1 \ + --run_mode "prefill" \ + --tp 8 \ + --dp 8 \ + --host $host \ + --port 8019 \ + --nccl_port 2732 \ + --enable_fa3 \ + --disable_cudagraph \ + --pd_master_ip $pd_master_ip \ No newline at end of file diff --git a/docs/EN/source/tutorial/multimodal.rst b/docs/EN/source/tutorial/multimodal.rst new file mode 100644 index 000000000..1b25fae88 --- /dev/null +++ b/docs/EN/source/tutorial/multimodal.rst @@ -0,0 +1,139 @@ +Multimodal Model Launch Configuration +==================================== + +LightLLM supports inference for various multimodal models. Below, using InternVL as an example, we explain the launch commands for multimodal services. + +Basic Launch Command +------------------- + +.. code-block:: bash + + INTERNVL_IMAGE_LENGTH=256 \ + LOADWORKER=12 \ + python -m lightllm.server.api_server \ + --port 8080 \ + --tp 2 \ + --model_dir ${MODEL_PATH} \ + --mem_fraction 0.8 \ + --trust_remote_code \ + --enable_multimodal + +Core Parameter Description +------------------------- + +Environment Variables +^^^^^^^^^^^^^^^^^^^^ + +- **INTERNVL_IMAGE_LENGTH**: Set the image token length for InternVL model, default is 256 +- **LOADWORKER**: Set the number of worker processes for model loading + +Basic Service Parameters +^^^^^^^^^^^^^^^^^^^^^^^ + +- **--port 8080**: API server listening port +- **--tp 2**: Tensor parallelism degree +- **--model_dir**: InternVL model file path +- **--mem_fraction 0.8**: GPU memory usage ratio +- **--trust_remote_code**: Allow loading custom model code +- **--enable_multimodal**: Enable multimodal functionality + +Advanced Configuration Parameters +-------------------------------- + +.. code-block:: bash + + --visual_infer_batch_size 2 \ + --cache_capacity 500 \ + --visual_dp dp_size \ + --visual_tp tp_size + +- **--visual_infer_batch_size 2**: Visual inference batch size +- **--cache_capacity 500**: Image embedding cache capacity +- **--visual_dp 2**: Visual model data parallelism degree +- **--visual_tp 2**: Visual model tensor parallelism degree + +.. note:: To ensure equal memory load on each GPU, visual_dp * visual_tp = tp is required. For example, if tp=2, then visual_dp=1, visual_tp=2. + +ViT Deployment Methods +---------------------- + +ViT TP (Tensor Parallel) +^^^^^^^^^^^^^^^^^^^^^^^ + +- Default usage +- --visual_tp tp_size enables tensor parallelism + +ViT DP (Data Parallel) +^^^^^^^^^^^^^^^^^^^^^ + +- Distribute different image batches to multiple GPUs +- Each GPU runs a complete ViT model copy +- --visual_dp dp_size enables data parallelism + +Image Caching Mechanism +---------------------- +LightLLM caches embeddings of input images. In multi-turn conversations, if the images are the same, cached embeddings can be used directly, avoiding repeated inference. 
+ +- **--cache_capacity**: Controls the number of cached image embeds +- Matching based on image MD5 hash value +- Uses LRU (Least Recently Used) eviction mechanism +- Hit image cache can directly skip ViT inference + +Testing +------- + +.. code-block:: python + + import json + import requests + import base64 + + def run(query, uris): + images = [] + for uri in uris: + if uri.startswith("http"): + images.append({"type": "url", "data": uri}) + else: + with open(uri, 'rb') as fin: + b64 = base64.b64encode(fin.read()).decode("utf-8") + images.append({'type': "base64", "data": b64}) + + data = { + "inputs": query, + "parameters": { + "max_new_tokens": 200, + # The space before <|endoftext|> is important, + # the server will remove the first bos_token_id, + # but QWen tokenizer does not has bos_token_id + "stop_sequences": [" <|endoftext|>", " <|im_start|>", " <|im_end|>"], + }, + "multimodal_params": { + "images": images, + } + } + + url = "http://127.0.0.1:8000/generate" + headers = {'Content-Type': 'application/json'} + response = requests.post(url, headers=headers, data=json.dumps(data)) + return response + + query = """ + <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + + What is this?<|im_end|> + <|im_start|>assistant + """ + + response = run( + uris = [ + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" + ], + query = query + ) + + if response.status_code == 200: + print(f"Result: {response.json()}") + else: + print(f"Error: {response.status_code}, {response.text}") \ No newline at end of file diff --git a/docs/EN/source/tutorial/openai.rst b/docs/EN/source/tutorial/openai.rst new file mode 100644 index 000000000..270e29802 --- /dev/null +++ b/docs/EN/source/tutorial/openai.rst @@ -0,0 +1,203 @@ +.. _openai_api: + +LightLLM OpenAI API Usage Examples +================================== + +LightLLM provides an interface that is fully compatible with OpenAI API, supporting all standard OpenAI features including function calling. This document provides detailed information on how to use LightLLM's OpenAI interface. + +Basic Configuration +------------------ + +First, ensure that the LightLLM service is started: + +.. code-block:: bash + + # Start LightLLM service + python -m lightllm.server.api_server \ + --model_dir /path/to/your/model \ + --port 8088 \ + --tp 1 + +Basic Conversation Examples +-------------------------- + +1. Simple Conversation +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import requests + import json + + # Configuration + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + # Request data + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "Hello, please introduce yourself"} + ], + "temperature": 0.7, + "max_tokens": 1000 + } + + # Send request + response = requests.post(url, headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + print("Reply:", result["choices"][0]["message"]["content"]) + else: + print("Error:", response.status_code, response.text) + +2. Streaming Conversation +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "Please write a short essay about artificial intelligence"} + ], + "stream": True, + "temperature": 0.7, + "max_tokens": 1000 + } + + # Streaming request + response = requests.post(url, headers=headers, json=data, stream=True) + + if response.status_code == 200: + for line in response.iter_lines(): + if line: + line = line.decode('utf-8') + if line.startswith('data: '): + data_str = line[6:] # Remove "data: " prefix + if data_str == '[DONE]': + break + try: + chunk = json.loads(data_str) + if chunk['choices'][0]['delta'].get('content'): + print(chunk['choices'][0]['delta']['content'], end='', flush=True) + except json.JSONDecodeError: + continue + else: + print("Error:", response.status_code, response.text) + +Function Calling Examples +------------------------ + +LightLLM supports OpenAI's function calling functionality, providing function call parsing for three models. Specify the --tool_call_parser parameter when starting the service to choose. The service launch command is: + +.. code-block:: bash + + python -m lightllm.server.api_server \ + --model_dir /path/to/your/model \ + --port 8088 \ + --tp 1 \ + --tool_call_parser qwen25 + # Optional parameters are qwen25, llama3, mistral + +1. Basic Function Calling +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + # Define functions + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get current weather information for a specified city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "City name, e.g.: Beijing, Shanghai" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "Temperature unit" + } + }, + "required": ["city"] + } + } + } + ] + + # Request data + data = { + "model": "your_model_name", + "messages": [ + {"role": "user", "content": "What's the weather like in Beijing today?"} + ], + "tools": tools, + "tool_choice": "auto", # Let the model automatically decide whether to call functions + "temperature": 0.7, + "max_tokens": 1000 + } + + # Send request + response = requests.post(url, headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + message = result["choices"][0]["message"] + + # Check if there are function calls + if message.get("tool_calls"): + print("Model decided to call functions:") + for tool_call in message["tool_calls"]: + print(f"Function name: {tool_call['function']['name']}") + print(f"Arguments: {tool_call['function']['arguments']}") + else: + print("Reply:", message["content"]) + else: + print("Error:", response.status_code, response.text) + +2. Streaming Function Calling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + headers = {"Content-Type": "application/json"} + + tools = [ + { + "type": "function", + "function": { + "name": "calculate", + "description": "Perform mathematical calculations", + "parameters": { + "type": "object", + "properties": { + "expression": {"type": "string", "description": "Mathematical expression"} + }, + "required": ["expression"] + } + } + } + ] \ No newline at end of file diff --git a/docs/EN/source/tutorial/reward_model.rst b/docs/EN/source/tutorial/reward_model.rst new file mode 100644 index 000000000..d00f8b9fd --- /dev/null +++ b/docs/EN/source/tutorial/reward_model.rst @@ -0,0 +1,62 @@ +Reward Model Deployment Configuration +==================================== + +LightLLM supports inference for various reward models, used for evaluating conversation quality and generating reward scores. Currently supported reward models include InternLM2 Reward and Qwen2 Reward, etc. + +Basic Launch Command +--------------------- + +.. code-block:: bash + + python -m lightllm.server.api_server \ + --port 8080 \ + --model_dir ${MODEL_PATH} \ + --trust_remote_code \ + --use_reward_model # Enable reward model functionality (required parameter) + +Testing Examples +---------------- + +Python Testing Code +^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + import json + import requests + + # InternLM2 Reward test + query = "<|im_start|>user\nHello! What's your name?<|im_end|>\n<|im_start|>assistant\nMy name is InternLM2! A helpful AI assistant. What can I do for you?<|im_end|>\n<|reward|>" + + url = "http://127.0.0.1:8000/get_score" + headers = {'Content-Type': 'application/json'} + + data = { + "chat": query, + "parameters": { + "frequency_penalty": 1 + } + } + + response = requests.post(url, headers=headers, data=json.dumps(data)) + + if response.status_code == 200: + result = response.json() + print(f"Reward score: {result['score']}") + print(f"Input tokens: {result['prompt_tokens']}") + else: + print(f"Error: {response.status_code}, {response.text}") + +cURL Testing Command +^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + curl http://localhost:8000/get_score \ + -H "Content-Type: application/json" \ + -d '{ + "chat": "<|im_start|>user\nHello! What is AI?<|im_end|>\n<|im_start|>assistant\nAI stands for Artificial Intelligence, which refers to the simulation of human intelligence in machines.<|im_end|>\n<|reward|>", + "parameters": { + "frequency_penalty": 1 + } + }' \ No newline at end of file diff --git a/docs/EN/source/user/openapi_docs.rst b/docs/EN/source/user/openapi_docs.rst deleted file mode 100755 index 5af28ac4a..000000000 --- a/docs/EN/source/user/openapi_docs.rst +++ /dev/null @@ -1,43 +0,0 @@ -OpenApi docs -================================= - -The following documentation is automatically generated by openapi. After deploying with Lightllm, you can open it using ``host:port/docs`` - -.. raw:: html - - - - - - FastAPI - Swagger UI - - - -
-
- - - - - diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index e9943b05f..601b2a48a 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -375,7 +375,7 @@ def make_argument_parser() -> argparse.ArgumentParser: type=str, default=None, help="""Path of quantization config. It can be used for mixed quantization. - Examples can be found in lightllm/common/quantization/configs.""", + Examples can be found in test/advanced_config/mixed_quantization/llamacls-mix-down.yaml.""", ) parser.add_argument( "--vit_quant_type", diff --git a/lightllm/common/quantization/configs/llamacls-mix-down.yaml b/test/advanced_config/mixed_quantization/llamacls-mix-down.yaml similarity index 100% rename from lightllm/common/quantization/configs/llamacls-mix-down.yaml rename to test/advanced_config/mixed_quantization/llamacls-mix-down.yaml diff --git a/test/test_redundancy_expert_config.json b/test/advanced_config/redundancy_expert/test_redundancy_expert_config.json similarity index 100% rename from test/test_redundancy_expert_config.json rename to test/advanced_config/redundancy_expert/test_redundancy_expert_config.json diff --git a/test/benchmark/kernel/benchmark_fused_moe_triton.py b/test/benchmark/kernel/benchmark_fused_moe_triton.py new file mode 100644 index 000000000..6f7a5ee39 --- /dev/null +++ b/test/benchmark/kernel/benchmark_fused_moe_triton.py @@ -0,0 +1,330 @@ +# Adapted from +# https://github.com/sgl-project/sglang/blob/v0.4.6.post5/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py +import argparse + +import torch +import triton +import vllm +from transformers import AutoConfig +from lightllm.common.fused_moe.topk_select import select_experts +from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl +from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe as fused_moe_vllm +from sglang.srt.layers.moe.fused_moe_triton.fused_moe import ( + fused_moe as fused_moe_sglang, +) + + +def get_model_config(model_name: str, tp_size: int): + """Get model configuration parameters""" + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + intermediate_size = config.ffn_config.ffn_hidden_size + shard_intermediate_size = 2 * intermediate_size // tp_size + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // tp_size + elif config.architectures[0] == "Qwen2MoeForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // tp_size + elif config.architectures[0] == "Qwen3MoeForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // tp_size + elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]: + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // tp_size + elif config.architectures[0] in [ + "Grok1ForCausalLM", + "Grok1ImgGen", + "Grok1AForCausalLM", + ]: + E = config.num_local_experts + topk = config.num_experts_per_tok + intermediate_size = 
config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // tp_size + else: + # Default: Mixtral + E = config.num_local_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // tp_size + + vllm_version_num = vllm.__version_tuple__[0] * 100 + vllm.__version_tuple__[1] * 10 + vllm.__version_tuple__[2] + block_shape = None + if hasattr(config, "quantization_config") and "weight_block_size" in config.quantization_config: + block_shape = config.quantization_config["weight_block_size"] + assert len(block_shape) == 2 + assert vllm_version_num >= 66, "Block-wise quantized fp8 fused_moe is only supported for VLLM>=0.6.6.post1" + + shape_configs = { + "num_experts": E, + "topk": topk, + "hidden_size": config.hidden_size, + "shard_intermediate_size": shard_intermediate_size, + "dtype": config.torch_dtype, + "block_shape": block_shape, + } + print(f"{shape_configs=}") + return shape_configs + + +def fused_moe_lightllm_api( + x, + w1, + w2, + input_gating, + topk, + use_fp8_w8a8=False, + w1_scale=None, + w2_scale=None, + a1_scale=None, + a2_scale=None, + block_shape=None, +): + + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=input_gating, + correction_bias=None, + use_grouped_topk=False, + top_k=topk, + renormalize=True, + topk_group=None, + num_expert_group=None, + scoring_func="softmax", + ) + use_fp8_w8a8 = use_fp8_w8a8 + + return fused_experts_impl( + hidden_states=x, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + + +def fused_moe_vllm_api( + x, + w1, + w2, + input_gating, + topk, + use_fp8_w8a8=False, + w1_scale=None, + w2_scale=None, + a1_scale=None, + a2_scale=None, + block_shape=None, +): + if block_shape is not None: + return fused_moe_vllm( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + ) + else: + return fused_moe_vllm( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) + + +def fused_moe_sglang_api( + x, + w1, + w2, + input_gating, + topk, + use_fp8_w8a8=False, + w1_scale=None, + w2_scale=None, + a1_scale=None, + a2_scale=None, + block_shape=None, +): + return fused_moe_sglang( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + ) + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 8, 16, 32, 64, 128], + line_arg="provider", + line_vals=[ + "vllm_fused_moe_triton", + "sglang_fused_moe_triton", + "lightllm_fused_moe_triton", + ], + line_names=[ + "vllm_fused_moe_triton", + "sglang_fused_moe_triton", + "lightllm_fused_moe_triton", + ], + styles=[ + ("blue", "-"), + ("green", "-"), + ("red", "-"), + ], + ylabel="Time (ms)", + plot_name="fused-moe-performance", + args={}, + ) +) +def benchmark(batch_size, provider, model_config, use_fp8=False): + torch.set_default_device("cuda") + torch.cuda.manual_seed_all(0) + + num_tokens = batch_size + num_experts = model_config["num_experts"] + hidden_size = model_config["hidden_size"] + 
shard_intermediate_size = model_config["shard_intermediate_size"] + topk = model_config["topk"] + dtype = model_config["dtype"] + block_shape = getattr(model_config, "block_shape", None) + block_shape = [128, 128] + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + w1_scale = w2_scale = a1_scale = a2_scale = None + + if use_fp8: + init_dtype = dtype + w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype) + w2 = torch.randn(num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype) + w1 = w1.to(torch.float8_e4m3fn) + w2 = w2.to(torch.float8_e4m3fn) + + if block_shape is None: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + a1_scale = torch.randn(1, dtype=torch.float32) + a2_scale = torch.randn(1, dtype=torch.float32) + else: + block_n, block_k = block_shape[0], block_shape[1] + n_tiles_w1 = (shard_intermediate_size + block_n - 1) // block_n + n_tiles_w2 = (hidden_size + block_n - 1) // block_n + k_tiles_w1 = (hidden_size + block_k - 1) // block_k + k_tiles_w2 = (shard_intermediate_size // 2 + block_k - 1) // block_k + w1_scale = torch.rand((num_experts, n_tiles_w1, k_tiles_w1), dtype=torch.float32) + w2_scale = torch.rand((num_experts, n_tiles_w2, k_tiles_w2), dtype=torch.float32) + else: + w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=dtype) + w2 = torch.randn(num_experts, hidden_size, shard_intermediate_size // 2, dtype=dtype) + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + + # Warmup + api_func = ( + fused_moe_vllm_api + if provider == "vllm_fused_moe_triton" + else fused_moe_sglang_api + if provider == "lightllm_fused_moe_triton" + else fused_moe_lightllm_api + ) + for _ in range(10): + api_func( + x, + w1, + w2, + input_gating, + topk, + use_fp8_w8a8=use_fp8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + ) + torch.cuda.synchronize() + + quantiles = [0.5, 0.2, 0.8] + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: api_func( + x, + w1, + w2, + input_gating, + topk, + use_fp8_w8a8=use_fp8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + )[0], + quantiles=quantiles, + ) + return ms, min_ms, max_ms + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") + parser.add_argument("--tp-size", type=int, default=8) + parser.add_argument("--use-fp8", action="store_true") + parser.add_argument( + "--save-path", + type=str, + default="./configs/benchmark_ops/vllm_sglang_fused_moe/", + ) + args = parser.parse_args() + + model_config = get_model_config(args.model, args.tp_size) + benchmark.run( + show_plots=True, + print_data=True, + save_path=args.save_path, + model_config=model_config, + use_fp8=args.use_fp8, + ) + + +if __name__ == "__main__": + main() diff --git a/test/benchmark_client.py b/test/benchmark/service/benchmark_client.py similarity index 100% rename from test/benchmark_client.py rename to test/benchmark/service/benchmark_client.py diff --git a/test/benchmark_mcq.py b/test/benchmark/service/benchmark_mcq.py similarity index 68% rename from test/benchmark_mcq.py rename to test/benchmark/service/benchmark_mcq.py index 51cdee830..828a970cc 100644 --- a/test/benchmark_mcq.py +++ b/test/benchmark/service/benchmark_mcq.py @@ -26,13 +26,13 @@ import aiohttp import numpy as np -from 
transformers import PreTrainedTokenizerBase from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast QUESTION = {} + + def get_tokenizer( tokenizer_name: str, tokenizer_mode: str = "slow", @@ -42,25 +42,21 @@ def get_tokenizer( """Gets a tokenizer for the given model name via Huggingface.""" if tokenizer_mode == "slow": if kwargs.get("use_fast", False): - raise ValueError( - "Cannot use the fast tokenizer in slow tokenizer mode.") + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") kwargs["use_fast"] = True if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True): pass try: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args, - **kwargs) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args, **kwargs) except TypeError as e: - err_msg = ( - "Failed to load the tokenizer. If you are using a LLaMA-based " - f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original " - "tokenizer.") + err_msg = "Failed to load the tokenizer. {e}" raise RuntimeError(err_msg) from e if not isinstance(tokenizer, PreTrainedTokenizerFast): pass return tokenizer + # (prompt len, output len, latency) REQUEST_LATENCY: List[Tuple[int, int, float]] = [] @@ -73,11 +69,10 @@ def sample_requests( data = [] with open(dataset_path, "r") as f: questions = f.readlines() - gts = {} for question in questions: question = json.loads(question.strip()) file_name = question["file_name"].split(".")[0] - data.append((file_name, question['question_id'], question['instruction'], question['answer'])) + data.append((file_name, question["question_id"], question["instruction"], question["answer"])) if file_name not in QUESTION: QUESTION[file_name] = {} QUESTION[file_name][question["question_id"]] = [question["answer"]] @@ -107,25 +102,22 @@ async def send_request( output_len: int, port: int, ) -> None: - request_start_time = time.time() - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} headers = {"User-Agent": "Benchmark Client"} - file_name, question_id, inputs, answer = request - prompt = f"<系统> <对话历史> <知识> <最新问题> 用户:给出以下问题的答案:\n{inputs} SenseChat:" - print(prompt) - # prompt= "[Round {}]\n\n问:{}\n\n答:".format(1, inputs) - url = f'http://localhost:{port}/generate' + file_name, question_id, inputs, answer = request + prompt = "[Round {}]\n\n问:{}\n\n答:".format(1, inputs) + url = f"http://localhost:{port}/generate" data = { - 'inputs': prompt, - 'parameters': { - 'do_sample': False, - 'ignore_eos': True, - 'max_new_tokens': output_len, - # 'do_sample':True, + "inputs": prompt, + "parameters": { + "do_sample": False, + "ignore_eos": True, + "max_new_tokens": output_len, + # 'do_sample':True, # 'top_p':0.8, # 'temperature':0.8 - # 'temperature': 0.1, - } + # 'temperature': 0.1, + }, } timeout = aiohttp.ClientTimeout(total=3 * 3600) async with aiohttp.ClientSession(timeout=timeout) as session: @@ -140,6 +132,7 @@ async def send_request( if "error" not in output: break + async def benchmark( input_requests: List[Tuple[str, int, int]], request_rate: float, @@ -153,18 +146,18 @@ async def benchmark( def IsOpen(ip, port): - s = socket.socket(socket.AF_INET,socket.SOCK_STREAM) - index=1 + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: - s.connect((ip,int(port))) + s.connect((ip, int(port))) s.shutdown(2) - print('successfully launch 
model') + print("successfully launch model") return True except: time.sleep(10) return False + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -172,7 +165,6 @@ def main(args: argparse.Namespace): tokenizer = get_tokenizer(args.tokenizer, "slow") input_requests = sample_requests(args.dataset, tokenizer) - benchmark_start_time = time.time() asyncio.run(benchmark(input_requests, args.request_rate, args.port)) rights, alls = 0, 0 for file_name in QUESTION: @@ -186,19 +178,19 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Benchmark the online serving throughput.") - parser.add_argument("--dataset", type=str, required=True, - help="Path to the dataset.") - parser.add_argument("--tokenizer", type=str, required=True, - help="Name or path of the tokenizer.") - parser.add_argument("--request-rate", type=float, default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.") - parser.add_argument("--port", type=int, default=8000, - help="port number") + parser = argparse.ArgumentParser(description="Benchmark the online serving throughput.") + parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset.") + parser.add_argument("--tokenizer", type=str, required=True, help="Name or path of the tokenizer.") + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process to synthesize " + "the request arrival times.", + ) + parser.add_argument("--port", type=int, default=8000, help="port number") parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() main(args) diff --git a/test/server/benchmark_prompt_cache.py b/test/benchmark/service/benchmark_prompt_cache.py similarity index 87% rename from test/server/benchmark_prompt_cache.py rename to test/benchmark/service/benchmark_prompt_cache.py index 7a52420b0..66fcb5296 100644 --- a/test/server/benchmark_prompt_cache.py +++ b/test/benchmark/service/benchmark_prompt_cache.py @@ -1,3 +1,25 @@ +""" +This script benchmarks the performance of a large language model inference service via HTTP API, +supporting multi-user and multi-turn dialogue scenarios. 
+ +Main arguments: +- --model_url: Service address +- --model_name: Model name (for result file naming) +- --num_workers: Number of concurrent processes +- --first_input_len: Input length for the first turn +- --subsequent_input_len: Input length for subsequent turns +- --output_len: Number of tokens generated per turn +- --num_turns: Number of dialogue turns per user +- --num_users: Number of users +- --result_dir: Directory to save results +- --print: Whether to print the result +- --cache: Whether to cache the result +- --use_cache: Whether to use cached results + +Example usage: +python benchmark_prompt_cache.py --address http://localhost:8090 --model_name llama \\ +--num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1 +""" import requests import json import operator diff --git a/test/server/test_settings.py b/test/benchmark/service/benchmark_prompt_cache_multi_server.py similarity index 71% rename from test/server/test_settings.py rename to test/benchmark/service/benchmark_prompt_cache_multi_server.py index 3acf17376..ac22a56cc 100644 --- a/test/server/test_settings.py +++ b/test/benchmark/service/benchmark_prompt_cache_multi_server.py @@ -1,3 +1,28 @@ +""" +benchmark_multi_server.py + +This script is used for automated benchmarking of multiple model services (e.g., llama-7b, llama-13b), +evaluating their performance under different input lengths, output lengths, number of turns, concurrent users, +and worker threads. + +Main features: +- Supports automated testing for multiple models and parameter combinations. +- Collects and outputs various performance metrics, including throughput, QPS, and latency. +- Saves results as a Markdown table for easy analysis. + +Parameter description: +- models: Model names and their service URLs to be tested. +- first_input_lens: List of token lengths for the first input. +- subsequent_input_lens: List of token lengths for subsequent inputs. +- output_lens: List of output token lengths. +- num_turns: List of dialogue turns. +- num_workers: List of concurrent worker counts. +- num_users: List of concurrent user counts. +- result_dir: Directory to save results. 
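+
+Illustrative `models` setting (hypothetical names, ports, and structure; check the
+configuration used in this script before reusing it):
+
+    models = {
+        "llama-7b": "http://localhost:8017",
+        "llama-13b": "http://localhost:8018",
+    }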
+ +Example: + python benchmark_multi_server.py +""" import os import itertools from easydict import EasyDict diff --git a/test/benchmark_qps.py b/test/benchmark/service/benchmark_qps.py similarity index 100% rename from test/benchmark_qps.py rename to test/benchmark/service/benchmark_qps.py diff --git a/test/benchmark_serving.py b/test/benchmark/service/benchmark_sharegpt.py similarity index 71% rename from test/benchmark_serving.py rename to test/benchmark/service/benchmark_sharegpt.py index 9cde7fd8d..c9f92f098 100644 --- a/test/benchmark_serving.py +++ b/test/benchmark/service/benchmark_sharegpt.py @@ -25,11 +25,10 @@ import aiohttp import numpy as np -from transformers import PreTrainedTokenizerBase from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + def get_tokenizer( tokenizer_name: str, @@ -40,26 +39,22 @@ def get_tokenizer( """Gets a tokenizer for the given model name via Huggingface.""" if tokenizer_mode == "slow": if kwargs.get("use_fast", False): - raise ValueError( - "Cannot use the fast tokenizer in slow tokenizer mode.") + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") kwargs["use_fast"] = False if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True): pass try: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args, - **kwargs) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args, **kwargs) except TypeError as e: - err_msg = ( - "Failed to load the tokenizer. If you are using a LLaMA-based " - f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original " - "tokenizer.") + err_msg = "Failed to load the tokenizer. {e}" raise RuntimeError(err_msg) from e if not isinstance(tokenizer, PreTrainedTokenizerFast): pass return tokenizer + # (prompt len, output len, latency) REQUEST_LATENCY: List[Tuple[int, int, float]] = [] @@ -73,23 +68,18 @@ def sample_requests( with open(dataset_path) as f: dataset = json.load(f) # Filter out the conversations with less than 2 turns. - dataset = [ - data for data in dataset - if len(data["conversations"]) >= 2 - ] + dataset = [data for data in dataset if len(data["conversations"]) >= 2] # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] - + dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset] + print("read data set finish") # Tokenize the prompts and completions. 
import random + dataset = random.sample(dataset, num_requests * 3) prompts = [prompt for prompt, _ in dataset] completions = [completion for _, completion in dataset] - + prompt_token_ids = tokenizer(prompts).input_ids completion_token_ids = tokenizer(completions).input_ids tokenized_dataset = [] @@ -135,26 +125,21 @@ async def get_request( await asyncio.sleep(interval) -async def send_request( - prompt: str, - prompt_len: int, - output_len: int -) -> None: +async def send_request(prompt: str, prompt_len: int, output_len: int) -> None: request_start_time = time.time() - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} headers = {"User-Agent": "Benchmark Client"} - url = 'http://localhost:8000/generate' - + url = "http://localhost:8000/generate" + data = { - 'inputs': prompt, - 'parameters': { - 'do_sample': False, - 'ignore_eos': True, - 'max_new_tokens': output_len, - # 'temperature': 0.1, - } + "inputs": prompt, + "parameters": { + "do_sample": False, + "ignore_eos": True, + "max_new_tokens": output_len, + # 'temperature': 0.1, + }, } - timeout = aiohttp.ClientTimeout(total=3 * 3600) async with aiohttp.ClientSession(timeout=timeout) as session: @@ -165,7 +150,7 @@ async def send_request( chunks.append(chunk) output = b"".join(chunks).decode("utf-8") output = json.loads(output) - + if "error" not in output: break @@ -181,8 +166,7 @@ async def benchmark( tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len = request - task = asyncio.create_task(send_request(prompt, - prompt_len, output_len)) + task = asyncio.create_task(send_request(prompt, prompt_len, output_len)) tasks.append(task) await asyncio.gather(*tasks) @@ -204,33 +188,28 @@ def main(args: argparse.Namespace): # Compute the latency statistics. avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) print(f"Average latency: {avg_latency:.2f} s") - avg_per_token_latency = np.mean([ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in REQUEST_LATENCY - ]) + avg_per_token_latency = np.mean( + [latency / (prompt_len + output_len) for prompt_len, output_len, latency in REQUEST_LATENCY] + ) print(f"Average latency per token: {avg_per_token_latency:.2f} s") - avg_per_output_token_latency = np.mean([ - latency / output_len - for _, output_len, latency in REQUEST_LATENCY - ]) - print("Average latency per output token: " - f"{avg_per_output_token_latency:.2f} s") + avg_per_output_token_latency = np.mean([latency / output_len for _, output_len, latency in REQUEST_LATENCY]) + print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s") if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Benchmark the online serving throughput.") - parser.add_argument("--dataset", type=str, required=True, - help="Path to the dataset.") - parser.add_argument("--tokenizer", type=str, required=True, - help="Name or path of the tokenizer.") - parser.add_argument("--request-rate", type=float, default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. 
" - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.") - parser.add_argument("--num-prompts", type=int, default=1000, - help="Number of prompts to process.") + parser = argparse.ArgumentParser(description="Benchmark the online serving throughput.") + parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset.") + parser.add_argument("--tokenizer", type=str, required=True, help="Name or path of the tokenizer.") + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process to synthesize " + "the request arrival times.", + ) + parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.") parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() main(args) diff --git a/test/model/model_infer.py b/test/benchmark/static_inference/model_infer.py similarity index 55% rename from test/model/model_infer.py rename to test/benchmark/static_inference/model_infer.py index 3fe91d716..6cd97cfdf 100644 --- a/test/model/model_infer.py +++ b/test/benchmark/static_inference/model_infer.py @@ -7,7 +7,7 @@ from lightllm.utils.dist_utils import init_distributed_env, get_current_rank_in_dp from lightllm.utils.envs_utils import get_env_start_args from lightllm.models import get_model -from lightllm.common.basemodel.microbatch_overlap_objs import DecodeMicroBatch, PrefillMicroBatch +from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput from torch.profiler import profile, record_function, ProfilerActivity from lightllm.utils.log_utils import init_logger import torch.cuda as cuda @@ -35,9 +35,9 @@ def test_model_inference(args): "max_total_token_num": args.max_total_token_num, "graph_max_len_in_batch": args.max_req_total_len, "graph_max_batch_size": args.graph_max_batch_size, - "mem_faction": args.mem_fraction, - "max_req_num": max(args.batch_size, 2048), - "batch_max_tokens": args.batch_size * args.input_len, + "mem_fraction": args.mem_fraction, + "max_req_num": 2048, + "batch_max_tokens": 1024, "run_mode": "normal", "max_seq_length": args.max_req_total_len, "disable_cudagraph": args.disable_cudagraph, @@ -77,7 +77,7 @@ def overlap_prefill( _0_b_req_idx = b_req_idx[: batch_size // 2] _0_b_seq_len = b_seq_len[: batch_size // 2] _o_b_ready_cache_len = b_ready_cache_len[: batch_size // 2] - micro_batch1 = PrefillMicroBatch( + micro_batch1 = ModelInput( _0_batch_size, _0_total_token_num, _0_max_len_in_batch, @@ -85,6 +85,7 @@ def overlap_prefill( _0_mem_indexes, _0_b_req_idx, _0_b_seq_len, + True, _o_b_ready_cache_len, {}, ) @@ -98,7 +99,7 @@ def overlap_prefill( _1_b_seq_len = b_seq_len[batch_size // 2 :] _1_b_ready_cache_len = b_ready_cache_len[batch_size // 2 :] - micro_batch2 = PrefillMicroBatch( + micro_batch2 = ModelInput( _1_batch_size, _1_total_token_num, _1_max_len_in_batch, @@ -106,11 +107,14 @@ def overlap_prefill( _1_mem_indexes, _1_b_req_idx, _1_b_seq_len, + True, _1_b_ready_cache_len, {}, ) - logits, logits1 = model_part.microbatch_overlap_prefill(micro_batch1, micro_batch2) + output, output1 = model_part.microbatch_overlap_prefill(micro_batch1, micro_batch2) + logits = output.logits + logits1 = output1.logits return torch.cat((logits, logits1), dim=0) @@ -124,7 +128,7 @@ def overlap_decode( _0_mem_indexes = mem_indexes[: batch_size // 2] _0_b_req_idx = b_req_idx[: batch_size // 2] _0_b_seq_len = b_seq_len[: batch_size 
// 2] - micro_batch1 = DecodeMicroBatch( + micro_batch1 = ModelInput( _0_batch_size, _0_total_token_num, _0_max_len_in_batch, @@ -142,7 +146,7 @@ def overlap_decode( _1_b_req_idx = b_req_idx[batch_size // 2 :] _1_b_seq_len = b_seq_len[batch_size // 2 :] - micro_batch2 = DecodeMicroBatch( + micro_batch2 = ModelInput( _1_batch_size, _1_total_token_num, _1_max_len_in_batch, @@ -152,12 +156,40 @@ def overlap_decode( _1_b_seq_len, ) - logits, logits1 = model_part.microbatch_overlap_decode(micro_batch1, micro_batch2) + output, output1 = model_part.microbatch_overlap_decode(micro_batch1, micro_batch2) + logits = output.logits + logits1 = output1.logits return torch.cat((logits, logits1), dim=0) +def prefill( + model_part, + batch_size, + max_len_in_batch, + input_ids, + mem_indexes, + b_req_idx, + b_seq_len, + total_token_num, + b_ready_cache_len, +): + model_input = ModelInput( + batch_size, + total_token_num, + max_len_in_batch, + input_ids, + mem_indexes, + b_req_idx, + b_seq_len, + is_prefill=True, + b_ready_cache_len=b_ready_cache_len, + ) + model_output = model_part.forward(model_input) + return model_output.logits + + def decode(model_part, batch_size, max_len_in_batch, input_ids, mem_indexes, b_req_idx, b_seq_len, total_token_num): - logits = model_part.forward( + model_input = ModelInput( batch_size, total_token_num, max_len_in_batch, @@ -167,7 +199,8 @@ def decode(model_part, batch_size, max_len_in_batch, input_ids, mem_indexes, b_r b_seq_len, is_prefill=False, ) - return logits + model_output = model_part.forward(model_input) + return model_output.logits def torch_profile(fn, log_dir=None): @@ -183,123 +216,25 @@ def torch_profile(fn, log_dir=None): print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) -def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, ans_queue): - args = get_env_start_args() - import triton.profiler as proton - import torch - from lightllm.distributed import dist_group_manager - from lightllm.utils.dist_utils import set_current_device_id - - import torch.distributed as dist - - enable_decode_overlap = args.enable_decode_microbatch_overlap - group_size = 1 - if enable_decode_overlap or args.enable_prefill_microbatch_overlap: - assert batch_size % 2 == 0, "batch size must be even number" - group_size = 2 - init_distributed_env(model_kvargs) - dist_group_manager.create_groups(group_size=group_size) - model_cfg, _ = PretrainedConfig.get_config_dict(model_kvargs["weight_dir"]) - dist.barrier() - - torch.cuda.empty_cache() - - model_part, _ = get_model(model_cfg, model_kvargs) - - # warm up - # test_data = np.vstack([np.arange(5, input_len + 5) for _ in range(batch_size)]) +def run_forward_once(model_kvargs, input_len, output_len, batch_size, model_part, enable_overlap, torch_profile=False): test_data = np.vstack([np.random.randint(0, 50256, input_len) for _ in range(batch_size)]) test_data = test_data.reshape(-1) test_data = torch.from_numpy(test_data).cuda() - - b_req_idx = torch.tensor( - [model_part.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda" - ) - b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda") - b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda") - for i in range(batch_size): - b_seq_len[i] = input_len - - total_token_num = input_len * batch_size - mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0]).cuda() - if args.enable_prefill_microbatch_overlap: - logics = overlap_prefill( - model_part, - batch_size, - 
input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - b_ready_cache_len, - ) - else: - logics = model_part.forward( - batch_size, - total_token_num, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - b_ready_cache_len=b_ready_cache_len, - is_prefill=True, - ) - prob_out = torch.softmax(logics, dim=-1) - predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() - - for i in range(output_len): - total_token_num += batch_size - b_seq_len += 1 - mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0]).cuda() - max_len_in_batch = input_len + i + 1 - if enable_decode_overlap: - logits = overlap_decode( - model_part, - batch_size, - max_len_in_batch, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - ) - else: - logits = decode( - model_part, - batch_size, - max_len_in_batch, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - ) - - prob_out = torch.softmax(logits, dim=-1) - predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() - - model_part.mem_manager.free_all() - model_part.req_manager.free_all() - - b_req_idx = None - b_seq_len = None + import torch.distributed as dist dist.barrier() import time - torch.cuda.synchronize() - start_time = time.time() + dp_size = model_kvargs["dp_size"] + torch.cuda.synchronize() prefill_start_time = time.time() b_req_idx = torch.tensor( [model_part.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda" ) b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda") + b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda") for i in range(batch_size): b_seq_len[i] = input_len @@ -307,86 +242,59 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0]).cuda() rank_id = model_kvargs["rank_id"] - if rank_id == 0: - if args.profile: - proton.start(name="forward_prefill", context="python") - if args.enable_prefill_microbatch_overlap: - logics = overlap_prefill( - model_part, - batch_size, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - b_ready_cache_len, - ) + if enable_overlap: + prefill_fn = overlap_prefill + decode_fn = overlap_decode else: - logics = model_part.forward( - batch_size, - total_token_num, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - b_ready_cache_len=b_ready_cache_len, - is_prefill=True, - ) - prob_out = torch.softmax(logics, dim=-1) + prefill_fn = prefill + decode_fn = decode + + logits = prefill_fn( + model_part, + batch_size, + input_len, + test_data, + mem_indexes, + b_req_idx, + b_seq_len, + total_token_num, + b_ready_cache_len, # b_ready_cache_len + ) + + prob_out = torch.softmax(logits, dim=-1) predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() + _ = predict_ids.detach().cpu().numpy() torch.cuda.synchronize() if rank_id == 0: - if args.profile: - proton.finalize() - print("prefill time cost:", (time.time() - prefill_start_time) * 1000) + print( + f"prefill time cost: {(time.time() - prefill_start_time) * 1000}, " + f"prefill throughput: {dp_size * batch_size * input_len / (time.time() - prefill_start_time)} tokens/s" + ) - if args.torch_profile: + if torch_profile: 
print("Profile Prefill") try: - if args.enable_prefill_microbatch_overlap: - torch_profile( - lambda: overlap_prefill( - model_part, - batch_size, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - b_ready_cache_len, - ), - log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}", - ) - else: - torch_profile( - lambda: model_part.forward( - batch_size, - total_token_num, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - b_ready_cache_len=b_ready_cache_len, - is_prefill=True, - ), - log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}", - ) + torch_profile( + lambda: prefill_fn( + model_part, + batch_size, + input_len, + test_data, + mem_indexes, + b_req_idx, + b_seq_len, + total_token_num, + b_ready_cache_len, # b_ready_cache_len + ), + log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}", + ) except Exception as e: print(str(e)) raise - if rank_id == 0: - if args.profile: - proton.start(name="forward_decode", context="python") - for i in range(output_len): torch.cuda.synchronize() step_start = time.time() @@ -394,49 +302,24 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an b_seq_len += 1 mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0]).cuda() max_len_in_batch = input_len + i + 1 - if enable_decode_overlap: - logits = overlap_decode( - model_part, - batch_size, - max_len_in_batch, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - ) - if i == 0 and args.torch_profile: - torch_profile( - lambda: overlap_decode( - model_part, - batch_size, - max_len_in_batch, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - ), - log_dir=f"./logs/forward_decode_{model_kvargs['rank_id']}", - ) - else: - logits = decode( - model_part, - batch_size, - max_len_in_batch, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - total_token_num, - ) - if i == 0 and args.torch_profile: + logits = decode_fn( + model_part, + batch_size, + max_len_in_batch, + predict_ids.view(-1), + mem_indexes, + b_req_idx, + b_seq_len, + total_token_num, + ) + if torch_profile: + try: torch_profile( - lambda: decode( + lambda: decode_fn( model_part, batch_size, max_len_in_batch, - torch.from_numpy(predict_ids).cuda().reshape(-1), + predict_ids.view(-1), mem_indexes, b_req_idx, b_seq_len, @@ -444,26 +327,86 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an ), log_dir=f"./logs/forward_decode_{model_kvargs['rank_id']}", ) + except Exception as e: + print(str(e)) + raise prob_out = torch.softmax(logits, dim=-1) predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() + _ = predict_ids.detach().cpu().numpy() torch.cuda.synchronize() if i % 100 == 0 or i == output_len - 1: if rank_id == 0: - print(i, "step cost time:", (time.time() - step_start) * 1000) + print( + f"i: {i}, step cost time: {(time.time() - step_start) * 1000} ms, " + f"throughput: {dp_size * batch_size / (time.time() - step_start)} tokens/s" + ) + model_part.mem_manager.free_all() + model_part.req_manager.free_all() torch.cuda.synchronize() - end_time = time.time() + torch.cuda.empty_cache() + + +def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, ans_queue): + args = get_env_start_args() + import triton.profiler as proton + import torch + from lightllm.distributed 
import dist_group_manager + from lightllm.utils.dist_utils import set_current_device_id + + if isinstance(batch_size, int): + batch_size = [batch_size] + else: + batch_size = [2, 8, 16, 32, 64, 128] + print(batch_size) + + import torch.distributed as dist + + enable_decode_overlap = args.enable_decode_microbatch_overlap + group_size = 1 + if enable_decode_overlap or args.enable_prefill_microbatch_overlap: + assert batch_size % 2 == 0, "batch size must be even number" + group_size = 2 + init_distributed_env(model_kvargs) + dist_group_manager.create_groups(group_size=group_size) + model_cfg, _ = PretrainedConfig.get_config_dict(model_kvargs["weight_dir"]) + dist.barrier() + + torch.cuda.empty_cache() + enable_overlap = args.enable_decode_microbatch_overlap or args.enable_prefill_microbatch_overlap + + model_part, _ = get_model(model_cfg, model_kvargs) + + rank_id = model_kvargs["rank_id"] + for b in batch_size: + if rank_id == 0: + print(f"Testing batch size {b}") + + # warm up + run_forward_once( + model_kvargs, + input_len, + output_len=10, + batch_size=b, + model_part=model_part, + enable_overlap=enable_overlap, + torch_profile=False, + ) + + # test + run_forward_once( + model_kvargs, + input_len, + output_len, + batch_size=b, + model_part=model_part, + enable_overlap=enable_overlap, + torch_profile=False, + ) + if rank_id == 0: + print("=" * 50) - if rank_id == 0: - if args.profile: - proton.finalize() - # triton version need >= 3.2.0 - # pip install llnl-hatchet - # proton-viewer -m time/ms,time/% forward_prefill.hatchet - # proton-viewer -m time/ms,time/% forward_decode.hatchet - print("time total cost(ms):", (end_time - start_time) * 1000) ans_queue.put(True) return diff --git a/test/model/model_infer_mtp.py b/test/benchmark/static_inference/model_infer_mtp.py similarity index 100% rename from test/model/model_infer_mtp.py rename to test/benchmark/static_inference/model_infer_mtp.py diff --git a/test/model/test_model.py b/test/benchmark/static_inference/test_model.py similarity index 89% rename from test/model/test_model.py rename to test/benchmark/static_inference/test_model.py index bf7d0ac43..5b3751bcc 100644 --- a/test/model/test_model.py +++ b/test/benchmark/static_inference/test_model.py @@ -27,8 +27,8 @@ def test_model_infer(self): import torch parser = make_argument_parser() - parser.add_argument("--batch_size", type=int, default=2, help="batch size") - parser.add_argument("--input_len", type=int, default=4096, help="input sequence length") + parser.add_argument("--batch_size", type=int, default=None, help="batch size") + parser.add_argument("--input_len", type=int, default=64, help="input sequence length") parser.add_argument("--output_len", type=int, default=128, help="output sequence length") parser.add_argument( "--profile", diff --git a/test/model/model_infer_vit.py b/test/benchmark/static_inference/test_vit.py similarity index 65% rename from test/model/model_infer_vit.py rename to test/benchmark/static_inference/test_vit.py index 556795c8e..279542ddd 100644 --- a/test/model/model_infer_vit.py +++ b/test/benchmark/static_inference/test_vit.py @@ -6,9 +6,10 @@ from lightllm.models.vit.model import VisionTransformer from lightllm.utils.dist_utils import init_vision_distributed_env +import argparse -def test_model_inference(world_size, weight_dir, quant_type=None): +def test_model_inference(world_size, weight_dir, quant_type=None, batch_size=1, image_size=448): workers = [] for rank_id in range(world_size): kvargs = { @@ -23,7 +24,7 @@ def test_model_inference(world_size, 
weight_dir, quant_type=None): "quant_cfg": None, } - proc = multiprocessing.Process(target=tppart_model_infer, args=(kvargs,)) + proc = multiprocessing.Process(target=tppart_model_infer, args=(kvargs, batch_size, image_size)) proc.start() workers.append(proc) @@ -32,7 +33,7 @@ def test_model_inference(world_size, weight_dir, quant_type=None): return -def tppart_model_infer(model_kvargs): +def tppart_model_infer(model_kvargs, batch_size, image_size): import torch import torch.distributed as dist @@ -41,7 +42,7 @@ def tppart_model_infer(model_kvargs): torch.cuda.empty_cache() model_part = VisionTransformer(model_kvargs) - test_data = torch.randn((13, 3, 448, 448)).cuda().to(torch.bfloat16) + test_data = torch.randn((batch_size, 3, image_size, image_size)).cuda().to(torch.bfloat16) # warm up torch.cuda.synchronize() for i in range(10): @@ -56,6 +57,7 @@ def tppart_model_infer(model_kvargs): end_time = time.time() if rank_id == 0: print("time total cost(ms):", (end_time - start_time) / 50 * 1000) + print("image per second:", batch_size * 50 / (end_time - start_time)) return @@ -63,7 +65,13 @@ def tppart_model_infer(model_kvargs): if __name__ == "__main__": import torch - world_size = 2 - weight_dir = "/nvme/models/InternVL2/InternVL2-8B/" + parser = argparse.ArgumentParser() + parser.add_argument("--model_dir", type=str, default="./InternVL2/InternVL2-8B/") + parser.add_argument("--world_size", type=int, default=2) + parser.add_argument("--quant_type", type=str, default="none") + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--image_size", type=int, default=448) + args = parser.parse_args() + torch.multiprocessing.set_start_method("spawn") - test_model_inference(world_size, weight_dir, "none") + test_model_inference(args.world_size, args.model_dir, args.quant_type, args.batch_size, args.image_size) diff --git a/test/compare_with_previous_commit.py b/test/compare_with_previous_commit.py new file mode 100644 index 000000000..3004e6146 --- /dev/null +++ b/test/compare_with_previous_commit.py @@ -0,0 +1,198 @@ +""" +This script starts the inference server, sends a set of prompts, +collects the outputs, and supports comparing the results between +the current commit and a specified historical commit for accuracy testing. 
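+
+Outputs: generations from the current commit are written to test_results_current.txt,
+generations from the baseline commit to test_results_compare.txt, and a unified diff
+of the two files to diff.txt (also printed to the terminal).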
+
+The command is:
+python compare_with_previous_commit.py --tp 2 --model_dir /xx/xx --compare_commit_id xxxx
+
+"""
+import difflib
+import argparse
+import subprocess
+import time
+import os
+import requests
+import sys
+import json
+import shutil
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tp", type=int, required=True, help="Number of GPUs to use.")
+    parser.add_argument("--model_dir", type=str, required=True, help="Directory of the model.")
+    parser.add_argument("--compare_commit_id", type=str, default=None, help="The commit id of the baseline.")
+    return parser.parse_args()
+
+
+def start_server(tp, model_dir):
+    cmd = [
+        "python",
+        "-m",
+        "lightllm.server.api_server",
+        "--tp",
+        str(tp),
+        "--model_dir",
+        model_dir,
+        "--data_type",
+        "fp16",
+        "--mode",
+        "triton_gqa_flashdecoding",
+        "--trust_remote_code",
+        "--tokenizer_mode",
+        "fast",
+        "--host",
+        "0.0.0.0",
+        "--port",
+        "8080",
+    ]
+    process = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
+    return process
+
+
+def check_health():
+    health_url = "http://localhost:8080/health"
+    try:
+        r = requests.get(health_url, timeout=2)
+        return r.status_code == 200
+    except Exception:
+        return False
+
+
+def send_prompts(prompts, output_file):
+    for prompt in prompts:
+        while not check_health():
+            time.sleep(1)
+
+        request_data = {
+            "inputs": prompt,
+            "parameters": {"max_new_tokens": 1024, "frequency_penalty": 1, "do_sample": False},
+            "multimodal_params": {},
+        }
+
+        try:
+            r = requests.post("http://localhost:8080/generate", json=request_data, timeout=10)
+            response_json = json.loads(r.text)
+            generated_text = (
+                response_json["generated_text"][0] if "generated_text" in response_json else "No generated_text."
+            )
+        except Exception as e:
+            generated_text = f"ERROR: {str(e)}"
+
+        with open(output_file, "a", encoding="utf-8") as f:
+            f.write(f"===== prompt: {prompt} =====\n")
+            f.write(f"{generated_text}\n\n")
+
+    print(f"===================Output saved in {output_file}===========================")
+
+
+def compare_files(file1, file2, diff_output_file="diff.txt"):
+    with open(file1, "r", encoding="utf-8") as f1, open(file2, "r", encoding="utf-8") as f2:
+        lines1 = f1.readlines()
+        lines2 = f2.readlines()
+
+    diff = difflib.unified_diff(lines1, lines2, fromfile=file1, tofile=file2, lineterm="")
+
+    diff_list = list(diff)
+    if not diff_list:
+        print("The two files are identical.")
+        return
+
+    # Print the diff to the terminal
+    for line in diff_list:
+        if line.startswith("+") and not line.startswith("+++"):
+            print("\033[32m" + line + "\033[0m", end="")  # green
+        elif line.startswith("-") and not line.startswith("---"):
+            print("\033[31m" + line + "\033[0m", end="")  # red
+        else:
+            print(line, end="")
+
+    # Save the diff to a file
+    with open(diff_output_file, "w", encoding="utf-8") as f:
+        for line in diff_list:
+            f.write(line + "\n")
+    print(f"\nDifferences saved to {diff_output_file}")
+
+
+def run_and_save(tp, model_dir, output_file, prompts):
+    """
+    Start the server, send prompts, and save the results to output_file.
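+
+    tp and model_dir are forwarded to the server launch command, output_file is the
+    file the generated texts are appended to, and prompts is the list of prompt
+    strings to send.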
+ """ + # Remove the old result file if it exists + if os.path.exists(output_file): + os.remove(output_file) + + process = None + try: + # Start the inference server + process = start_server(tp, model_dir) + # Send prompts and save results + send_prompts(prompts, output_file) + finally: + # Shutdown the server + if process is not None: + process.terminate() + process.wait() + + +def main(): + # Parse arguments + args = parse_args() + tp = args.tp + model_dir = args.model_dir + compare_commit_id = args.compare_commit_id + + # Prompts to test + prompts = [ + "What is the machine learning?", + "1+1等于几", + "What role does attention play in transformer architectures?", + "西红柿炒鸡蛋怎么做?", + "Describe the concept of overfitting and underfitting.", + "CPU和GPU的区别是什么?", + "What is the role of a loss function in machine learning?", + ] + + # Run and save results for the current commit + current_output_file = "test_results_current.txt" + run_and_save(tp, model_dir, current_output_file, prompts) + + # If compare_commit_id is provided, run and save results for the baseline commit + if compare_commit_id: + # Get the absolute path of the current script + script_path = os.path.abspath(__file__) + script_name = os.path.basename(script_path) + tmp_script = f"/tmp/{script_name}" + # Copy the current script to /tmp to ensure it exists in the baseline commit + shutil.copy(script_path, tmp_script) + # Save current commit id + current_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip() + # Save current branch name (if any) + current_branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode().strip() + # Stash any local changes + subprocess.run(["git", "stash"]) + # Checkout the baseline commit + subprocess.run(["git", "checkout", compare_commit_id]) + # Copy the script back to the original location in case it does not exist in the baseline commit + shutil.copy(tmp_script, script_path) + try: + compare_output_file = "test_results_compare.txt" + run_and_save(tp, model_dir, compare_output_file, prompts) + finally: + # Checkout back to the original branch or commit + if current_branch != "HEAD": + subprocess.run(["git", "checkout", current_branch]) + else: + subprocess.run(["git", "checkout", current_commit]) + # Pop the stashed changes + subprocess.run(["git", "stash", "pop"]) + # Remove the temporary script file + if os.path.exists(tmp_script): + os.remove(tmp_script) + # Compare the results + compare_files(current_output_file, compare_output_file) + + +if __name__ == "__main__": + main() diff --git a/test/deepseek.sh b/test/deepseek.sh deleted file mode 100644 index 78e40a116..000000000 --- a/test/deepseek.sh +++ /dev/null @@ -1,87 +0,0 @@ -# 单机 deepseek V3 ep 运行模式启动示例, 启动参数中的tp含义发生了变化,代表使用的所有卡数量,并不是tp推理。 -# max_total_token_num 可以按照实际场景调节。 -MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 --model_dir /dev/shm/DeepSeek-R1 \ ---tp 8 \ ---dp 8 \ ---max_total_token_num 200000 \ ---graph_max_batch_size 64 \ ---batch_max_tokens 8192 \ ---enable_flashinfer_prefill \ ---enable_flashinfer_decode \ ---enable_prefill_microbatch_overlap \ ---disable_aggressive_schedule - -# H800 双机 deepseek V3 ep 运行模式启动实列 -# 启动命令中的 nccl_host 和 nccl_port 两个节点的必须一致,一般nccl_host设置为 node 0的ip。 -# max_total_token_num 最佳设置需要按照使用场景和显存情况配置。 -# 启动后两个节点的8088端口都可以接收访问的请求 -# node 0 -MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 --model_dir /dev/shm/DeepSeek-R1 \ ---tp 16 \ ---dp 16 \ ---max_total_token_num 200000 \ ---graph_max_batch_size 64 \ 
---batch_max_tokens 8192 \ ---enable_flashinfer_prefill \ ---enable_flashinfer_decode \ ---enable_prefill_microbatch_overlap \ ---nnodes 2 \ ---node_rank 0 \ ---nccl_host \ ---nccl_port 2732 -# node 1 -MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 --model_dir /dev/shm/DeepSeek-R1 \ ---tp 16 \ ---dp 16 \ ---max_total_token_num 200000 \ ---graph_max_batch_size 64 \ ---batch_max_tokens 8192 \ ---enable_flashinfer_prefill \ ---enable_flashinfer_decode \ ---enable_prefill_microbatch_overlap \ ---nnodes 2 \ ---node_rank 1 \ ---nccl_host \ ---nccl_port 2732 - -# pd 分离启动示列, 单机 做 P 和 D, 也支持多机组成的D和单机的P混合。 -# 目前 P D 分离的 PD master可能存在并发处理问题,还需提升。 - -# pd master 启动 -python -m lightllm.server.api_server --model_dir /dev/shm/DeepSeek-R1 --run_mode "pd_master" --host `hostname -i` --port 60011 - -# p 启动 -nvidia-cuda-mps-control -d -MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server --model_dir /dev/shm/DeepSeek-R1 \ ---run_mode "prefill" \ ---tp 8 \ ---dp 8 \ ---host `hostname -i` \ ---port 8019 \ ---nccl_port 2732 \ ---max_total_token_num 200000 \ ---batch_max_tokens 8192 \ ---enable_flashinfer_prefill \ ---enable_flashinfer_decode \ ---enable_prefill_microbatch_overlap \ ---disable_cudagraph \ ---pd_master_ip \ ---pd_master_port 60011 - -# d 启动 -nvidia-cuda-mps-control -d -MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server --model_dir /dev/shm/DeepSeek-R1 \ ---run_mode "decode" \ ---tp 8 \ ---dp 8 \ ---host `hostname -i` \ ---port 8121 \ ---nccl_port 12322 \ ---max_total_token_num 200000 \ ---graph_max_batch_size 64 \ ---enable_flashinfer_prefill \ ---enable_flashinfer_decode \ ---enable_prefill_microbatch_overlap \ ---pd_master_ip \ ---pd_master_port 60011 - diff --git a/test/kernel/alignment/llama_gqa_decode_vsm.py b/test/kernel/alignment/llama_gqa_decode_vsm.py deleted file mode 100644 index f124a28eb..000000000 --- a/test/kernel/alignment/llama_gqa_decode_vsm.py +++ /dev/null @@ -1,104 +0,0 @@ -import unittest -import random -import torch -from tqdm import tqdm -from lightllm.common.basemodel.infer_struct import InferStateInfo -from lightllm.common.req_manager import ReqManager -from lightllm.models.llama.triton_kernel.gqa_flash_decoding_vsm import ( - gqa_token_decode_attention_flash_decoding_vsm, -) -from lightllm.models.llama.triton_kernel.gqa_flash_decoding import ( - gqa_token_decode_attention_flash_decoding, -) - - -class TestVSMGQADecoding(unittest.TestCase): - def test_vsm_gqa_decoding_align(self): - random.seed(0) - torch.manual_seed(0) - torch.cuda.manual_seed(0) - torch.cuda.manual_seed_all(0) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - bs_list = [1, 8, 16, 32, 64, 128, 256] - group_size_list = [16, 32, 64] - seq_len_list = [128, 512, 1024, 2048, 4096, 8192] - q_head_dim_list = [64, 128] - q_head_num_list = [8, 16, 32] - - def get_test_configs(): - for bs in bs_list: - for group_size in group_size_list: - for seq_len_m in seq_len_list: - for q_head_dim in q_head_dim_list: - for q_head_num in q_head_num_list: - if q_head_num < group_size: - continue - yield bs, group_size, seq_len_m, q_head_dim, q_head_num - - for bs, group_size, seq_len_m, q_head_dim, q_head_num in tqdm(list(get_test_configs())): - kv_head_num = q_head_num // group_size - q_head_dim = q_head_dim - kv_head_dim = q_head_dim - seq_len = (torch.zeros(bs, dtype=torch.int32) + seq_len_m).to(torch.int32) - total_token_in_the_batch = seq_len.sum().item() - rounded_total_token_in_the_batch = 
(total_token_in_the_batch + 128 - 1) // 128 * 128 - - q_shape = [bs, q_head_num, q_head_dim] - kv_shape = [ - rounded_total_token_in_the_batch, - kv_head_num, - kv_head_dim, - ] - qkv_dtype = torch.float16 - - q, k, v = ( - torch.randn(q_shape, dtype=qkv_dtype, device="cuda"), - torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"), - torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"), - ) - q, k, v = q / 10, k / 10, v / 10 - - req_to_token_index = torch.zeros((bs, seq_len_m)) - 1 - token_index = torch.arange(rounded_total_token_in_the_batch) - - total_count = 0 - for i in range(bs): - req_to_token_index[i, : seq_len[i]] = token_index[total_count : total_count + seq_len[i]] - total_count += seq_len[i] - - req_to_token_index = req_to_token_index.long().cuda() - - b_req_idx = torch.arange(bs, device="cuda") - infer_state = InferStateInfo() - infer_state.req_manager = ReqManager(bs, 2048, None) - infer_state.req_manager.req_to_token_indexs = req_to_token_index - infer_state.b_req_idx = b_req_idx.cuda() - infer_state.b_seq_len = seq_len.cuda() - infer_state.max_len_in_batch = seq_len_m - infer_state.batch_size = bs - infer_state.q_head_num = q_head_num - infer_state.q_head_dim = q_head_dim - infer_state.kv_head_num = kv_head_num - infer_state.softmax_scale = 1 / (q_head_dim ** 0.5) - infer_state.total_token_num = torch.tensor([total_token_in_the_batch], dtype=torch.int32).cuda() - new_out = gqa_token_decode_attention_flash_decoding_vsm(q, k, v, infer_state) - old_out = gqa_token_decode_attention_flash_decoding( - q, - infer_state, - infer_state.q_head_num, - infer_state.q_head_dim, - k, - v, - ) - cos_sim = torch.nn.functional.cosine_similarity(new_out, old_out, dim=-1).mean().cpu().item() - self.assertGreaterEqual( - cos_sim, - 0.9, - f"bs={bs},group_size={group_size},seq_len={seq_len_m},q_head_dim={q_head_dim},q_head_num={q_head_num}", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/kernel/tuning/deepseekv2_gqa_decode_tuning.py b/test/kernel/deepseekv2_gqa_decode_tuning.py similarity index 100% rename from test/kernel/tuning/deepseekv2_gqa_decode_tuning.py rename to test/kernel/deepseekv2_gqa_decode_tuning.py diff --git a/test/kernel/fuse_moe_tuning_fp8.py b/test/kernel/fuse_moe_tuning.py similarity index 80% rename from test/kernel/fuse_moe_tuning_fp8.py rename to test/kernel/fuse_moe_tuning.py index a30de8d03..6e971573a 100644 --- a/test/kernel/fuse_moe_tuning_fp8.py +++ b/test/kernel/fuse_moe_tuning.py @@ -1,10 +1,12 @@ import os +import argparse import torch import time import torch.multiprocessing as mp from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl, moe_align, moe_align1, grouped_matmul from typing import List from lightllm.utils.log_utils import init_logger +from transformers import AutoConfig logger = init_logger(__name__) @@ -58,14 +60,37 @@ def test_kernel( test_count: int, use_fp8_w8a8: bool, is_up: bool, + block_shape, **config, ): set_seed() input_tuples = [] a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((expert_num, 2 * n, k), device="cuda", dtype=dtype) / 10 - w2 = torch.randn((expert_num, k, n), device="cuda", dtype=dtype) / 10 + w1_scale = w2_scale = None + + if use_fp8_w8a8: + init_dtype = dtype + w1 = torch.randn(expert_num, 2 * n, k, dtype=init_dtype).cuda() + w2 = torch.randn(expert_num, k, 2 * n // 2, dtype=init_dtype).cuda() + w1 = w1.to(torch.float8_e4m3fn) + w2 = w2.to(torch.float8_e4m3fn) + + if block_shape is None: + w1_scale = torch.randn(expert_num, dtype=torch.float32).cuda() + 
w2_scale = torch.randn(expert_num, dtype=torch.float32).cuda() + else: + block_n, block_k = block_shape[0], block_shape[1] + n_tiles_w1 = (2 * n + block_n - 1) // block_n + n_tiles_w2 = (k + block_n - 1) // block_n + k_tiles_w1 = (k + block_k - 1) // block_k + k_tiles_w2 = (2 * n // 2 + block_k - 1) // block_k + w1_scale = torch.rand((expert_num, n_tiles_w1, k_tiles_w1), dtype=torch.float32).cuda() + w2_scale = torch.rand((expert_num, n_tiles_w2, k_tiles_w2), dtype=torch.float32).cuda() + else: + w1 = torch.randn(expert_num, 2 * n, k, dtype=dtype).cuda() + w2 = torch.randn(expert_num, k, 2 * n // 2, dtype=dtype).cuda() + rnd_logics = torch.randn(m, expert_num, device="cuda") topk_values, topk_ids = torch.topk(rnd_logics, topk, dim=1) topk_weights = torch.randn((m, topk), device="cuda", dtype=dtype) / 10 @@ -75,12 +100,6 @@ def test_kernel( moe_align(topk_ids=topk_ids, out=expert_to_tokens) expert_to_token_num = torch.empty((expert_num,), dtype=torch.int32, device="cuda") moe_align1(expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk) - if use_fp8_w8a8: - w1, w1_scale = quantize_moe(w1) - w2, w2_scale = quantize_moe(w2) - else: - w1_scale = torch.empty((0,)) - w2_scale = torch.empty((0,)) out1 = torch.zeros((m * topk, 2 * n), dtype=torch.bfloat16, device="cuda") down_in = torch.zeros((m * topk, n), dtype=torch.bfloat16, device="cuda") @@ -142,6 +161,7 @@ def test_kernel( a, w1, w2, w1_scale, w2_scale, topk_ids, topk_weights, out1, out2, down_in = input_tuples[index] if is_up: grouped_matmul( + topk_ids.numel(), a, None, expert_to_token_num, @@ -158,6 +178,7 @@ def test_kernel( ) else: grouped_matmul( + topk_ids.numel(), down_in, None, expert_to_token_num, @@ -197,6 +218,7 @@ def worker( test_count: int, use_fp8_w8a8: bool, is_up: bool, + block_shape, test_configs, queue, ): @@ -212,6 +234,7 @@ def worker( test_count=test_count, use_fp8_w8a8=use_fp8_w8a8, is_up=is_up, + block_shape=block_shape, **test_configs[index], ) queue.put(cost_time) # Put result in queue @@ -278,6 +301,7 @@ def tuning_configs( test_count: int, use_fp8_w8a8: bool, is_up: bool, + block_shape, ): os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) best_config, best_cost_time = None, 10000000 @@ -300,6 +324,7 @@ def tuning_configs( test_count, use_fp8_w8a8, is_up, + block_shape, test_configs, queue, ), @@ -333,6 +358,7 @@ def tuning_configs( test_count, use_fp8_w8a8, is_up, + block_shape, test_configs, queue, ), @@ -358,16 +384,30 @@ def tuning_configs( return best_config, best_cost_time -if __name__ == "__main__": +def main(args): torch.multiprocessing.set_start_method("spawn") from lightllm.utils.tuning_utils import mp_tuning from lightllm.common.fused_moe.moe_kernel_configs import MoeGroupedGemmKernelConfig - # tuning to get deepseekv2 large configs and store in H800, tp 8 - expert_num = 160 - n = 192 # up is n * 2 - hidden_dim = 5120 - topk_num = 6 + config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) + if config.architectures[0] == "Qwen3MoeForCausalLM": + expert_num = config.num_experts + topk_num = config.num_experts_per_tok + n = 2 * config.moe_intermediate_size // args.tp + elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]: + expert_num = config.n_routed_experts + topk_num = config.num_experts_per_tok + n = 2 * config.moe_intermediate_size // args.tp + else: + pass + + hidden_dim = getattr(config, "hidden_size", None) or config.text_config.hidden_size + use_fp8_w8a8 = args.use_fp8_w8a8 + block_shape = None + if hasattr(config, 
"quantization_config") and "weight_block_size" in config.quantization_config: + block_shape = config.quantization_config["weight_block_size"] + assert len(block_shape) == 2 + use_fp8_w8a8 = True up_dict = {} for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]: @@ -381,8 +421,9 @@ def tuning_configs( "topk": topk_num, "dtype": torch.bfloat16, "test_count": 20, - "use_fp8_w8a8": True, + "use_fp8_w8a8": use_fp8_w8a8, "is_up": True, + "block_shape": block_shape, }, ) up_dict[m] = ans @@ -392,7 +433,7 @@ def tuning_configs( topk_num=topk_num, expert_num=expert_num, mul_routed_weight=False, - use_fp8_w8a8=True, + use_fp8_w8a8=use_fp8_w8a8, out_dtype=str(torch.bfloat16), config_json=up_dict, ) @@ -409,8 +450,9 @@ def tuning_configs( "topk": topk_num, "dtype": torch.bfloat16, "test_count": 20, - "use_fp8_w8a8": True, + "use_fp8_w8a8": use_fp8_w8a8, "is_up": False, + "block_shape": block_shape, }, ) down_dict[m] = ans @@ -421,7 +463,16 @@ def tuning_configs( topk_num=1, expert_num=expert_num, mul_routed_weight=True, - use_fp8_w8a8=True, + use_fp8_w8a8=use_fp8_w8a8, out_dtype=str(torch.bfloat16), config_json=down_dict, ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_dir", type=str, default="deepseek-ai/DeepSeek-R1") + parser.add_argument("--tp", type=int, default=8) + parser.add_argument("--use_fp8_w8a8", action="store_true") + args = parser.parse_args() + main(args) diff --git a/test/kernel/fuse_moe_tuning_bf16.py b/test/kernel/fuse_moe_tuning_bf16.py deleted file mode 100644 index 712f2ab29..000000000 --- a/test/kernel/fuse_moe_tuning_bf16.py +++ /dev/null @@ -1,423 +0,0 @@ -import os -import torch -import time -import torch.multiprocessing as mp -from lightllm.common.fused_moe.grouped_fused_moe import fused_experts_impl, moe_align, moe_align1, grouped_matmul -from typing import List -from lightllm.utils.log_utils import init_logger - -logger = init_logger(__name__) - - -def set_seed(): - import torch - import random - import numpy as np - - seed = 42 - torch.manual_seed(seed) - random.seed(seed) - np.random.seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - return - - -def quantize_moe(weight): - - from lightllm.utils.vllm_utils import vllm_ops - - assert ( - vllm_ops is not None - ), "vllm is not installed, you can't use the api of it. \ - You can solve it by running `pip install vllm`." 
- - num_experts = weight.shape[0] - qweights = [] - weight_scales = [] - qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda() - for i in range(num_experts): - qweight, weight_scale = vllm_ops.scaled_fp8_quant( - weight[i].contiguous().cuda(), scale=None, use_per_token_if_dynamic=False - ) - qweights[i] = qweight - weight_scales.append(weight_scale) - weight_scale = torch.cat(weight_scales, dim=0).reshape(-1) - return qweights, weight_scale - - -@torch.no_grad() -def test_kernel( - expert_num: int, - m: int, - n: int, - k: int, - topk: int, - dtype: torch.dtype, - test_count: int, - use_fp8_w8a8: bool, - is_up: bool, - **config, -): - set_seed() - input_tuples = [] - - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((expert_num, 2 * n, k), device="cuda", dtype=dtype) / 10 - w2 = torch.randn((expert_num, k, n), device="cuda", dtype=dtype) / 10 - rnd_logics = torch.randn(m, expert_num, device="cuda") - topk_values, topk_ids = torch.topk(rnd_logics, topk, dim=1) - topk_weights = torch.randn((m, topk), device="cuda", dtype=dtype) / 10 - - expert_to_tokens = torch.empty((expert_num, topk * m), dtype=torch.int32, device="cuda") - expert_to_weights = torch.empty((expert_num, topk * m), dtype=torch.float32, device="cuda") - moe_align(topk_ids=topk_ids, out=expert_to_tokens) - expert_to_token_num = torch.empty((expert_num,), dtype=torch.int32, device="cuda") - moe_align1(expert_to_tokens, topk_weights, expert_to_weights, expert_to_token_num, topk=topk) - if use_fp8_w8a8: - w1, w1_scale = quantize_moe(w1) - w2, w2_scale = quantize_moe(w2) - else: - w1_scale = torch.empty((0,)) - w2_scale = torch.empty((0,)) - - out1 = torch.zeros((m * topk, 2 * n), dtype=torch.bfloat16, device="cuda") - down_in = torch.zeros((m * topk, n), dtype=torch.bfloat16, device="cuda") - out2 = torch.zeros((m * topk, k), dtype=torch.bfloat16, device="cuda") - - for _ in range(test_count): - input_tuples.append( - ( - a.clone(), - w1.clone(), - w2.clone(), - w1_scale.clone(), - w2_scale.clone(), - topk_ids.clone(), - topk_weights.clone(), - out1.clone(), - out2.clone(), - down_in.clone(), - ) - ) - - if is_up: - grouped_matmul( - topk_ids.numel(), - a, - None, - expert_to_token_num, - expert_to_tokens, - expert_to_weights=expert_to_weights, - expert_weights=w1, - expert_to_weights_scale=w1_scale, - topk_num=topk, - out=out1, - mul_routed_weight=False, - use_fp8_w8a8=use_fp8_w8a8, - **config, - ) - else: - grouped_matmul( - topk_ids.numel(), - down_in, - None, - expert_to_token_num, - expert_to_tokens, - expert_to_weights=expert_to_weights, - expert_weights=w2, - expert_to_weights_scale=w2_scale, - topk_num=1, - out=out2, - mul_routed_weight=True, - use_fp8_w8a8=use_fp8_w8a8, - **config, - ) - - graph = torch.cuda.CUDAGraph() - - with torch.cuda.graph(graph): - for index in range(test_count): - a, w1, w2, w1_scale, w2_scale, topk_ids, topk_weights, out1, out2, down_in = input_tuples[index] - if is_up: - grouped_matmul( - a, - None, - expert_to_token_num, - expert_to_tokens, - expert_to_weights=expert_to_weights, - expert_weights=w1, - expert_to_weights_scale=w1_scale, - topk_num=topk, - out=out1, - expert_token_limit=2 ** 31 - 1, - mul_routed_weight=False, - use_fp8_w8a8=use_fp8_w8a8, - **config, - ) - else: - grouped_matmul( - down_in, - None, - expert_to_token_num, - expert_to_tokens, - expert_to_weights=expert_to_weights, - expert_weights=w2, - expert_to_weights_scale=w2_scale, - topk_num=1, - out=out2, - expert_token_limit=2 ** 31 - 1, - mul_routed_weight=True, - 
use_fp8_w8a8=use_fp8_w8a8, - **config, - ) - - graph.replay() - - torch.cuda.synchronize() - start = time.time() - graph.replay() - torch.cuda.synchronize() - - cost_time = (time.time() - start) * 1000 - - logger.info(str(config)) - logger.info(f"bf16 {m} cost time: {cost_time} ms") - return cost_time - - -def worker( - expert_num: int, - m: int, - n: int, - k: int, - topk: int, - dtype: torch.dtype, - test_count: int, - use_fp8_w8a8: bool, - is_up: bool, - test_configs, - queue, -): - try: - for index in range(len(test_configs)): - cost_time = test_kernel( - expert_num=expert_num, - m=m, - n=n, - k=k, - topk=topk, - dtype=dtype, - test_count=test_count, - use_fp8_w8a8=use_fp8_w8a8, - is_up=is_up, - **test_configs[index], - ) - queue.put(cost_time) # Put result in queue - - except Exception as ex: - logger.error(str(ex)) - logger.exception(str(ex)) - import sys - - sys.exit(-1) - pass - - -def get_test_configs(split_id, split_count): - index = 0 - for num_stages in range(1, 6): - for GROUP_SIZE_M in [ - 1, - 2, - 4, - 8, - ]: - for num_warps in [ - 2, - 4, - 8, - 16, - ]: - for BLOCK_SIZE_M in [ - 16, - 32, - 64, - 128, - ]: - for BLOCK_SIZE_N in [16, 32, 64, 128]: - for BLOCK_SIZE_K in [16, 32, 64, 128]: - t_config = { - "BLOCK_SIZE_M": BLOCK_SIZE_M, - "BLOCK_SIZE_N": BLOCK_SIZE_N, - "BLOCK_SIZE_K": BLOCK_SIZE_K, - "GROUP_SIZE_M": GROUP_SIZE_M, - "num_warps": num_warps, - "num_stages": num_stages, - } - if index % split_count == split_id: - yield t_config - index += 1 - else: - index += 1 - - -def tuning_configs( - device_id: int, # use for mult mp tunning - device_count: int, - expert_num: int, - m: int, - n: int, - k: int, - topk: int, - dtype: torch.dtype, - test_count: int, - use_fp8_w8a8: bool, - is_up: bool, -): - os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) - best_config, best_cost_time = None, 10000000 - queue = mp.Queue() - test_configs = [] - for t_config in get_test_configs(device_id, device_count): - test_configs.append(t_config) - if len(test_configs) < 256: - continue - - p = mp.Process( - target=worker, - args=( - expert_num, - m, - n, - k, - topk, - dtype, - test_count, - use_fp8_w8a8, - is_up, - test_configs, - queue, - ), - ) - p.start() - p.join() - while len(test_configs) != 0: - try: - cost_time = queue.get_nowait() - logger.info(f"get {test_configs[0]} cost_time: {cost_time}") - if cost_time < best_cost_time: - best_config = test_configs[0] - best_cost_time = cost_time - logger.info(f"cur best : {best_config} {best_cost_time}") - del test_configs[0:1] - except: - del test_configs[0:16] - logger.info(f"cur best : {best_config} {best_cost_time}") - break - - while len(test_configs) != 0: - p = mp.Process( - target=worker, - args=( - expert_num, - m, - n, - k, - topk, - dtype, - test_count, - use_fp8_w8a8, - is_up, - test_configs, - queue, - ), - ) - p.start() - p.join() - - while len(test_configs) != 0: - try: - cost_time = queue.get_nowait() - logger.info(f"get {test_configs[0]} cost_time: {cost_time}") - if cost_time < best_cost_time: - best_config = test_configs[0] - best_cost_time = cost_time - logger.info(f"cur best : {best_config} {best_cost_time}") - del test_configs[0:1] - except: - del test_configs[0:16] - logger.info(f"cur best : {best_config} {best_cost_time}") - break - - logger.info(f"{best_config} best cost: {best_cost_time}") - return best_config, best_cost_time - - -if __name__ == "__main__": - torch.multiprocessing.set_start_method("spawn") - from lightllm.utils.tuning_utils import mp_tuning - from lightllm.common.fused_moe.moe_kernel_configs 
import MoeGroupedGemmKernelConfig - - # tuning to get deepseekv2 lite configs and store tp 1 - expert_num = 64 - n = 1408 # up is n * 2 - hidden_dim = 2048 - topk_num = 6 - - up_dict = {} - for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]: - ans = mp_tuning( - tuning_configs, - { - "expert_num": expert_num, - "m": m, - "n": n, - "k": hidden_dim, - "topk": topk_num, - "dtype": torch.bfloat16, - "test_count": 20, - "use_fp8_w8a8": False, - "is_up": True, - }, - ) - up_dict[m] = ans - MoeGroupedGemmKernelConfig.save_config( - N=n * 2, - K=hidden_dim, - topk_num=topk_num, - expert_num=expert_num, - mul_routed_weight=False, - use_fp8_w8a8=False, - out_dtype=str(torch.bfloat16), - config_json=up_dict, - ) - - down_dict = {} - for m in [1, 8, 64, 128, 256, 512, 1024, 4096, 8192]: - ans = mp_tuning( - tuning_configs, - { - "expert_num": expert_num, - "m": m, - "n": n, - "k": hidden_dim, - "topk": topk_num, - "dtype": torch.bfloat16, - "test_count": 20, - "use_fp8_w8a8": False, - "is_up": False, - }, - ) - down_dict[m] = ans - MoeGroupedGemmKernelConfig.save_config( - N=hidden_dim, - K=n, - topk_num=1, - expert_num=expert_num, - mul_routed_weight=True, - use_fp8_w8a8=False, - out_dtype=str(torch.bfloat16), - config_json=down_dict, - ) diff --git a/test/kernel/tuning/llama_gqa_decode_vsm_tuning.py b/test/kernel/llama_gqa_decode_vsm_tuning.py similarity index 100% rename from test/kernel/tuning/llama_gqa_decode_vsm_tuning.py rename to test/kernel/llama_gqa_decode_vsm_tuning.py diff --git a/test/model/test_script.sh b/test/model/test_script.sh deleted file mode 100755 index 985868b59..000000000 --- a/test/model/test_script.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -DATASET_PATH="/your/date/path" # 你的数据集路径 -FILE_PATH="/model/root" # 确保这里是所有模型的上级目录 -HOST="0.0.0.0" -NCCL_PORT=28000 -CUDA_LIST=(0 1 2 3) -PORT=8000 -MAX_PORT=65535 -NUM_PROMPTS=100 -REQUEST_RATE=20 - -test_models() { - local -a models=("${!1}") - local -a modes=("${!2}") - echo "models: ${models[@]}" - echo "modes: ${modes[@]}" - local model_num=${#models[@]} - local loop_num=${#modes[@]} - - for model in "${models[@]}"; do - local model_dir="${FILE_PATH}/${model}" - # export CUDA_VISIBLE_DEVICES=${CUDA_LIST[i]} - - for ((i = 0; i <= loop_num; i++)); do - local current_port=$PORT - local current_nccl_port=$((NCCL_PORT+i)) - - # 检查端口是否被占用 - while lsof -i:$current_nccl_port &>/dev/null || lsof -i:$current_port &>/dev/null; do - current_nccl_port=$((current_nccl_port+1)) - current_port=$((current_port+1)) - if [ "$current_port" -gt "$MAX_PORT" ] || [ "$current_nccl_port" -gt "$MAX_PORT" ]; then - echo "No available ports found." - exit 1 - fi - done - - echo "Start ${model_dir} on port ${current_port} with GPU ${CUDA_LIST[i]} and NCCL_PORT ${current_nccl_port} with mode ${modes[i]}" - if [ "$i" -eq 0 ]; then - nohup python -m lightllm.server.api_server --model_dir "${model_dir}" --host ${HOST} --port ${current_port} --tp 1 --trust_remote_code --nccl_port ${current_nccl_port} > server_output.log 2>&1 & - else - echo "idx:${i} with mode ${modes[i-1]}" - nohup python -m lightllm.server.api_server --model_dir "${model_dir}" --mode "${modes[i-1]}" --host ${HOST} --port ${current_port} --tp 1 --trust_remote_code --nccl_port ${current_nccl_port} > server_output.log 2>&1 & - fi - local server_pid=$! - - # 等待服务器启动并监控输出 - echo "Waiting for server to start..." - tail -f server_output.log | while read line; do - echo "${line}" - if [[ "${line}" == *"Uvicorn running on http://0.0.0.0"* ]]; then - echo "Server is ready. Starting the client..." 
- pkill -P $$ tail # 终止 tail 进程 继续执行后面的命令 - break - fi - done - - # 启动接收端程序 - echo "Starting the client to send requests..." - python test/benchmark_serving.py --tokenizer "${model_dir}" --dataset "${DATASET_PATH}" --num-prompts ${NUM_PROMPTS} --request-rate ${REQUEST_RATE} --port ${current_port} --model "${model}" - echo "Client finished." - - # 接收端程序完成后,关闭服务器 - echo "Shutting down the server: pid=${server_pid}" - kill "${server_pid}" - sleep 1 - # 检查进程是否仍然存在 - if ps -p "${server_pid}" > /dev/null; then # 尝试获取特定 PID 的进程信息 - echo "The server is still running." - kill -9 "${server_pid}" - else - echo "The server has been stopped." - fi - done - done -} - -# 示例调用 -MODEL_ARRAY_LLAMA=("llama2-13b-chat") -MODE_ARRAY_LLAMA=("triton_int8weight" "triton_int4weight") -test_models MODEL_ARRAY_LLAMA[@] MODE_ARRAY_LLAMA[@] \ No newline at end of file diff --git a/test/model/test_settings/model_infer_batchs.py b/test/model/test_settings/model_infer_batchs.py deleted file mode 100644 index 4a780ac69..000000000 --- a/test/model/test_settings/model_infer_batchs.py +++ /dev/null @@ -1,223 +0,0 @@ -import os -import numpy as np -from multiprocessing import Queue -import multiprocessing - - -def test_model_inference(world_size, model_dir, model_class, batch_sizes, input_len, output_len, mode, log_path): - ans_queue = Queue() - workers = [] - for rank_id in range(world_size): - model_kvargs = { - "run_mode": "normal", - "tp_rank": rank_id, - "world_size": world_size, - "weight_dir": model_dir, - "max_total_token_num": None, - "mem_faction": 0.8, - "load_way": "HF", - "batch_max_tokens": (input_len + output_len), - "mode": mode, - "max_req_num": max(batch_sizes), - "graph_max_batch_size": max(batch_sizes), - "graph_max_len_in_batch": (input_len + output_len), - "max_seq_length": (input_len + output_len), - } - - proc = multiprocessing.Process( - target=tppart_model_infer, - args=(model_class, model_kvargs, batch_sizes, input_len, output_len, ans_queue, log_path), - ) - proc.start() - workers.append(proc) - - while True: - import time - - exist_dead = any([not proc.is_alive() for proc in workers]) - if exist_dead: - time.sleep(4) - exist_err = any([proc.exitcode != 0 for proc in workers]) - if exist_err: - return -1 - else: - break - time.sleep(1) - - while not ans_queue.empty(): - if not ans_queue.get(): - return -1 - return 0 - - -def tppart_model_infer(model_class, model_kvargs, batch_sizes, input_len, output_len, ans_queue, log_path): - assert log_path is not None - need_run_batch_sizes = [] - for batch_size in batch_sizes: - new_log_path = log_path.replace("batch_size", str(batch_size)) - if os.path.exists(new_log_path): - with open(new_log_path, "r") as fp_file: - lines = fp_file.readlines() - if len(lines) >= 2 and "time total cost(ms):" in lines[-1]: # 说明已经跑过了,直接过滤掉。 - continue - else: - need_run_batch_sizes.append(batch_size) - else: - need_run_batch_sizes.append(batch_size) - - if len(need_run_batch_sizes) == 0: - return - - import torch - import torch.distributed as dist - - rank_id = model_kvargs["tp_rank"] - world_size = model_kvargs["world_size"] - - torch.cuda.set_device(rank_id) - dist.init_process_group("nccl", init_method="tcp://127.0.0.1:28765", rank=rank_id, world_size=world_size) - dist.barrier() - - torch.cuda.empty_cache() - - model_part = model_class(model_kvargs) - - for batch_size in need_run_batch_sizes: - model_part.mem_manager.free_all() - model_part.req_manager.free_all() - model_part.mem_manager.resize_mem(batch_size * (input_len + output_len)) - # warm up - test_data = 
np.vstack([np.arange(5, input_len + 5) for _ in range(batch_size)]) - test_data = test_data.reshape(-1) - test_data = torch.from_numpy(test_data).cuda() - - b_req_idx = model_part.req_manager.alloc(batch_size).int() - b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda") - for i in range(batch_size): - b_seq_len[i] = input_len - - total_token_num = input_len * batch_size - mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0]) - logics = model_part.forward( - batch_size, - total_token_num, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - is_prefill=True, - ) - prob_out = torch.softmax(logics, dim=-1) - predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() - - for i in range(output_len): - total_token_num += batch_size - b_seq_len += 1 - mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0]) - logics = model_part.forward( - batch_size, - total_token_num, - input_len + i + 1, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - is_prefill=False, - ) - prob_out = torch.softmax(logics, dim=-1) - predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() - - model_part.mem_manager.free_all() - model_part.req_manager.free_all() - - if rank_id == 0: - print("can use mem size:", model_part.mem_manager.can_use_mem_size) - print("can use req size:", model_part.req_manager.can_use_req_size) - - b_req_idx = None - b_seq_len = None - - dist.barrier() - if rank_id == 0: - new_log_path = log_path.replace("batch_size", str(batch_size)) - fp_file = open(new_log_path, "w+") - - import time - - torch.cuda.synchronize() - start_time = time.time() - - prefill_start_time = time.time() - - b_req_idx = model_part.req_manager.alloc(batch_size).int() - b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda") - for i in range(batch_size): - b_seq_len[i] = input_len - - total_token_num = batch_size * input_len - mem_indexes = model_part.req_manager.mem_manager.alloc(test_data.shape[0]) - logics = model_part.forward( - batch_size, - total_token_num, - input_len, - test_data, - mem_indexes, - b_req_idx, - b_seq_len, - is_prefill=True, - ) - prob_out = torch.softmax(logics, dim=-1) - predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() - - torch.cuda.synchronize() - if rank_id == 0: - print("prefill time cost:", (time.time() - prefill_start_time) * 1000, file=fp_file) - - for i in range(output_len): - torch.cuda.synchronize() - step_start = time.time() - total_token_num += batch_size - b_seq_len += 1 - mem_indexes = model_part.req_manager.mem_manager.alloc(predict_ids.shape[0]) - logics = model_part.forward( - batch_size, - total_token_num, - input_len + i + 1, - torch.from_numpy(predict_ids).cuda().reshape(-1), - mem_indexes, - b_req_idx, - b_seq_len, - is_prefill=False, - ) - prob_out = torch.softmax(logics, dim=-1) - predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) - predict_ids = predict_ids.detach().cpu().numpy() - torch.cuda.synchronize() - if i % 100 == 0 or i == output_len - 1: - if rank_id == 0: - print(i, "step cost time:", (time.time() - step_start) * 1000, file=fp_file) - - torch.cuda.synchronize() - end_time = time.time() - - if rank_id == 0: - print("time total cost(ms):", (end_time - start_time) * 1000, file=fp_file) - import sys - - if fp_file is not sys.stdout: - fp_file.flush() - fp_file.close() - while 
not fp_file.closed: - fp_file.close() - - b_req_idx = None - b_seq_len = None - test_data = None - - ans_queue.put(True) - - return diff --git a/test/model/test_settings/process_utils.py b/test/model/test_settings/process_utils.py deleted file mode 100644 index 352e6f03d..000000000 --- a/test/model/test_settings/process_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -import subprocess -import re - - -def kill_gpu_processes(): - try: - output = subprocess.check_output(["nvidia-smi", "-q", "-x"]) - output = output.decode("utf-8") - - # 使用正则表达式提取进程信息 - process_info = re.findall(r"(.*?)", output, re.DOTALL) - - if process_info: - print("找到以下占用显卡的进程:") - for info in process_info: - pid = re.search(r"(.*?)", info).group(1) - process_name = re.search(r"(.*?)", info).group(1) - print("进程ID:", pid) - print("进程名字:", process_name) - - for info in process_info: - pid = re.search(r"(.*?)", info).group(1) - subprocess.call(["sudo", "kill", "-9", pid]) - print("进程ID", pid, "被终止") - else: - print("没有找到占用显卡的进程") - - except subprocess.CalledProcessError: - print("无法执行nvidia-smi命令") - - -if __name__ == "__main__": - kill_gpu_processes() diff --git a/test/model/test_settings/test_settings.py b/test/model/test_settings/test_settings.py deleted file mode 100644 index 2890c0c18..000000000 --- a/test/model/test_settings/test_settings.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import sys -from model_infer_batchs import test_model_inference -from process_utils import kill_gpu_processes - -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) -from datetime import datetime - - -from lightllm.models.bloom.model import BloomTpPartModel -from lightllm.models.llama.model import LlamaTpPartModel -from lightllm.models.starcoder.model import StarcoderTpPartModel -from lightllm.models.qwen.model import QWenTpPartModel -from lightllm.models.chatglm2.model import ChatGlm2TpPartModel -from lightllm.models.internlm.model import InternlmTpPartModel - - -base_dir = "/nvme/" - -model_to_class_and_path = { - "llama-7b": (LlamaTpPartModel, os.path.join(base_dir, "llama-7b")), - "llama-13b": (LlamaTpPartModel, os.path.join(base_dir, "")), - "internal-20b": (InternlmTpPartModel, os.path.join(base_dir, "")), - "llama-65b": (LlamaTpPartModel, os.path.join(base_dir, "")), - "llama2-70b": (LlamaTpPartModel, os.path.join(base_dir, "")), - "chatglm2-6b": (ChatGlm2TpPartModel, os.path.join(base_dir, "")), -} - - -def test_all_setting(gpu_name, model_name, mode, log_dir, world_sizes, in_out_lens, batch_sizes): - log_dir = os.path.join(log_dir, gpu_name, str(model_name)) - os.makedirs(log_dir, exist_ok=True) - - model_class, model_path = model_to_class_and_path[model_name] - kill_gpu_processes() - for world_size in world_sizes: - for in_len, out_len in in_out_lens: - kill_gpu_processes() - mode_str = "_".join(mode) - log_file_name = f"{model_name}##{mode_str}##{world_size}##{in_len}##{out_len}##batch_size##.log" - log_path = os.path.join(log_dir, log_file_name) - print(log_path) - test_model_inference(world_size, model_path, model_class, batch_sizes, in_len, out_len, mode, log_path) - log_md_file = log_dir + ".md" - md_file = open(log_md_file, "w") - # write head - heads = [ - "mode", - "world_size", - "batch_size", - "input_len", - "output_len", - "prefill_cost", - "first_step_latency", - "last_step_latency", - "mean_latency", - "prefill_throughput", - "decode_throughput", - "total_throughput", - "card_num_per_qps", - ] - md_file.write(f"test model: {model_name} \r\n") - md_file.write("|") 
- for head in heads: - md_file.write(head + "|") - md_file.write("\r\n") - md_file.write("|") - for _ in range(len(heads)): - md_file.write("------|") - md_file.write("\r\n") - log_files = list(os.listdir(log_dir)) - sorted(log_files, key=lambda x: tuple(map(int, x.split("##")[2:6]))) - for log_file in log_files: - _, mode, world_size, input_len, output_len, batch_size, _ = log_file.split("##") - fp_file = open(os.path.join(log_dir, log_file), "r") - all_lines = fp_file.readlines() - fp_file.close() - if len(all_lines) <= 2: - continue - prefill_cost = float(all_lines[0].split(":")[1].strip()) - firststep_cost = float(all_lines[1].split(":")[1].strip()) - laststep_cost = float(all_lines[-2].split(":")[1].strip()) - all_step_cost = float(all_lines[-1].split(":")[1].strip()) - mean_step_cost = (all_step_cost - prefill_cost) / float(output_len) - card_num_per_qps = float(world_size) / (float(batch_size) / (all_step_cost / 1000)) - prefill_throughput = float(batch_size) * float(input_len) / (prefill_cost / 1000) - decode_throughput = float(batch_size) * float(output_len) / ((all_step_cost - prefill_cost) / 1000) - total_throughput = float(batch_size) * (float(input_len) + float(output_len)) / (all_step_cost / 1000) - md_file.write("|") - infos = [ - mode, - world_size, - batch_size, - input_len, - output_len, - prefill_cost, - firststep_cost, - laststep_cost, - mean_step_cost, - prefill_throughput, - decode_throughput, - total_throughput, - card_num_per_qps, - ] - for info in infos: - md_file.write(str(format(info, ".4f")) if isinstance(info, float) else str(info)) - md_file.write("|") - md_file.write("\r\n") - md_file.close() - - -gpu_name = "A800" -in_out_lens = [(128, 128), (256, 256)] # in_out_lens 中的数据必须以从短到长的顺序排列,否则可能有问题。 -batch_sizes = [1, 2] # batch_sizes 中的数字也必须从小到大排列。 - - -if __name__ == "__main__": - import torch - - torch.multiprocessing.set_start_method("spawn") - - test_all_setting( - gpu_name, - "llama3-8b", - # mode=["triton_int8weight", "ppl_fp16_flashdecoding"], # mode 为 【】 为普通 fp16 的格式。 - mode=["triton_gqa_flashdecoding"], - log_dir="./", - world_sizes=[1], - in_out_lens=in_out_lens, - batch_sizes=batch_sizes, - ) diff --git a/test/server/readme.md b/test/server/readme.md deleted file mode 100644 index 0b8d53903..000000000 --- a/test/server/readme.md +++ /dev/null @@ -1,15 +0,0 @@ -# prompt cache 测试: - -- benchmark_prompt_cache.py: 单次测试脚本。 - - 例子: - ```shell - python benchmark_prompt_cache.py --address http://localhost:8090 --model_name llama --num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1 - ``` - - 使用方法详细说明: - ```shell - python benchmark_prompt_cache.py -h - ``` - -- test_settings.py: 批量测试脚本,可测试多个配置并汇总为md diff --git a/test/start_scripts/README.md b/test/start_scripts/README.md new file mode 100644 index 000000000..aff1d973f --- /dev/null +++ b/test/start_scripts/README.md @@ -0,0 +1,202 @@ +# LightLLM DeepSeek Model Startup Scripts + +This directory contains various startup scripts for deploying DeepSeek models with LightLLM, covering different deployment modes and hardware configurations. 
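+
+All of the scripts below share one launch pattern: optional environment variables placed in front of `python -m lightllm.server.api_server`, followed by mode-specific flags (see the Configuration Guide further down). As a minimal sketch of that pattern only, with placeholder values copied from the single-node scripts in this directory rather than a recommended configuration:
+
+```bash
+# Minimal sketch of the common launch pattern; adjust the model path, port and parallelism to your setup.
+# MOE_MODE=EP enables expert parallelism, LOADWORKER sets the number of model-loading threads.
+MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
+--port 8088 \
+--model_dir /path/DeepSeek-R1 \
+--tp 8 \
+--dp 8 \
+--enable_fa3
+```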
+ +## Script Categories + +### Single Node Deployment Scripts + +- `single_node_tp.sh` - Single node tensor parallelism (TP) mode +- `single_node_ep.sh` - Single node expert parallelism (EP) mode + +### Multi-Node Deployment Scripts + +- `multi_node_tp_node0.sh` - Multi-node tensor parallelism node 0 +- `multi_node_tp_node1.sh` - Multi-node tensor parallelism node 1 +- `multi_node_ep_node0.sh` - Multi-node expert parallelism node 0 +- `multi_node_ep_node1.sh` - Multi-node expert parallelism node 1 + +### PD Separated Deployment Scripts + +#### Single PD Master Mode +- `single_pd_master/pd_master.sh` - PD Master service +- `single_pd_master/pd_prefill.sh` - Prefill service +- `single_pd_master/pd_decode.sh` - Decode service + +#### Multi PD Master Mode +- `multi_pd_master/config_server.sh` - Configuration server +- `multi_pd_master/pd_master_1.sh` - PD Master 1 +- `multi_pd_master/pd_master_2.sh` - PD Master 2 +- `multi_pd_master/pd_prefill.sh` - Prefill service +- `multi_pd_master/pd_decode.sh` - Decode service + +## Usage Instructions + +### 1. Single Node TP Mode + +```bash +# Modify model path and run directly +sh single_node_tp.sh +``` + +### 2. Single Node EP Mode + +```bash +# Modify model path and run directly +sh single_node_ep.sh +``` + +### 3. Multi-Node TP Mode + +```bash +# Run on node 0 +sh multi_node_tp_node0.sh + +# Run on node 1 +sh multi_node_tp_node1.sh +``` + +### 4. Multi-Node EP Mode + +```bash +# Run on node 0 +sh multi_node_ep_node0.sh + +# Run on node 1 +sh multi_node_ep_node1.sh +``` + +### 5. Single PD Master Mode + +```bash +# Step 1: Start PD Master +sh single_pd_master/pd_master.sh + +# Step 2: Start Prefill service +sh single_pd_master/pd_prefill.sh + +# Step 3: Start Decode service +sh single_pd_master/pd_decode.sh +``` + +### 6. Multi PD Master Mode + +```bash +# Step 1: Start configuration server +sh multi_pd_master/config_server.sh + +# Step 2: Start multiple PD Masters +sh multi_pd_master/pd_master_1.sh +sh multi_pd_master/pd_master_2.sh + +# Step 3: Start Prefill and Decode services +sh multi_pd_master/pd_prefill.sh +sh multi_pd_master/pd_decode.sh +``` + +## Configuration Guide + +### Environment Variables + +- `LOADWORKER`: Model loading thread count, recommended 8-18 +- `MOE_MODE`: Expert parallelism mode, set to EP to enable expert parallelism +- `KV_TRANS_USE_P2P`: Enable P2P communication optimization +- `CUDA_VISIBLE_DEVICES`: Specify GPU devices to use + +### Important Parameters + +- `--model_dir`: Model file path +- `--tp`: Tensor parallelism degree +- `--dp`: Data parallelism degree +- `--enable_fa3`: Enable Flash Attention 3.0 +- `--nnodes`: Total number of nodes +- `--node_rank`: Current node rank +- `--nccl_host`: NCCL communication host address +- `--nccl_port`: NCCL communication port + +## Hardware Configuration Recommendations + +### H200 Single Node +- Recommended 8 GPUs, TP=8 +- Memory: At least 128GB system memory + +### H100 Dual Node +- Recommended 16 GPUs, TP=16 +- Network: High bandwidth, low latency network connection + +### General Recommendations +- Ensure GPU drivers and CUDA versions are compatible +- Check network connectivity and firewall settings +- Monitor GPU utilization and memory usage + +## Troubleshooting + +### Common Issues + +1. **NCCL Communication Errors** + - Check network connectivity + - Verify firewall settings + - Validate IP address configuration + +2. **Insufficient GPU Memory** + - Reduce batch_size + - Use more GPUs + - Enable KV cache optimization + +3. 
**Model Loading Failures** + - Check model path + - Verify file integrity + - Confirm permission settings + +### Performance Optimization + +1. **Enable MPS Service** + ```bash + nvidia-cuda-mps-control -d + ``` + +2. **Enable Micro-batch Overlap** + ```bash + --enable_prefill_microbatch_overlap + --enable_decode_microbatch_overlap + ``` + +3. **Adjust CUDA Graph Parameters** + ```bash + --graph_max_batch_size 100 + ``` + +## Testing and Validation + +### Basic Functionality Test + +```bash +curl http://server_ip:server_port/generate \ + -H "Content-Type: application/json" \ + -d '{ + "inputs": "What is AI?", + "parameters":{ + "max_new_tokens":17, + "frequency_penalty":1 + } + }' +``` + +### Performance Benchmark Test + +```bash +cd test +python benchmark_client.py \ +--num_clients 100 \ +--input_num 2000 \ +--tokenizer_path /path/DeepSeek-R1/ \ +--url http://127.0.0.1:8088/generate_stream +``` + +## Important Notes + +1. Please modify the model path in scripts before use +2. Adjust parameters according to actual hardware configuration +3. Ensure network environment meets multi-node deployment requirements +4. Recommend thorough testing before production deployment +5. Regularly monitor service status and performance metrics \ No newline at end of file diff --git a/test/start_scripts/multi_node_ep_node0.sh b/test/start_scripts/multi_node_ep_node0.sh new file mode 100644 index 000000000..3a139968a --- /dev/null +++ b/test/start_scripts/multi_node_ep_node0.sh @@ -0,0 +1,16 @@ +# H200 multi node deepseek R1 ep mode node 0 +# nccl_host: the ip of the nccl host +# sh multi_node_ep_node0.sh +export nccl_host=$1 +MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ +--model_dir /path/DeepSeek-R1 \ +--tp 16 \ +--dp 16 \ +--enable_fa3 \ +--nnodes 2 \ +--node_rank 0 \ +--nccl_host $nccl_host \ +--nccl_port 2732 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_prefill_microbatch_overlap +#--enable_decode_microbatch_overlap \ No newline at end of file diff --git a/test/start_scripts/multi_node_ep_node1.sh b/test/start_scripts/multi_node_ep_node1.sh new file mode 100644 index 000000000..b24a59868 --- /dev/null +++ b/test/start_scripts/multi_node_ep_node1.sh @@ -0,0 +1,16 @@ +# H200 multi node deepseek R1 ep mode node 1 +# nccl_host: the ip of the nccl host +# sh multi_node_ep_node1.sh +export nccl_host=$1 +MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ +--model_dir /path/DeepSeek-R1 \ +--tp 16 \ +--dp 16 \ +--enable_fa3 \ +--nnodes 2 \ +--node_rank 1 \ +--nccl_host $nccl_host \ +--nccl_port 2732 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_prefill_microbatch_overlap +#--enable_decode_microbatch_overlap \ No newline at end of file diff --git a/test/start_scripts/multi_node_tp_node0.sh b/test/start_scripts/multi_node_tp_node0.sh new file mode 100644 index 000000000..b86bdeb35 --- /dev/null +++ b/test/start_scripts/multi_node_tp_node0.sh @@ -0,0 +1,12 @@ +# H200/H100 multi node deepseek R1 tp mode node 0 +# nccl_host: the ip of the nccl host +# sh multi_node_tp_node0.sh +export nccl_host=$1 +LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ +--model_dir /path/DeepSeek-R1 \ +--tp 16 \ +--enable_fa3 \ +--nnodes 2 \ +--node_rank 0 \ +--nccl_host $nccl_host \ +--nccl_port 2732 \ No newline at end of file diff --git a/test/start_scripts/multi_node_tp_node1.sh b/test/start_scripts/multi_node_tp_node1.sh new file mode 100644 index 000000000..378977ab2 
--- /dev/null +++ b/test/start_scripts/multi_node_tp_node1.sh @@ -0,0 +1,12 @@ +# H200/H100 multi node deepseek R1 tp mode node 1 +# nccl_host: the ip of the nccl host +# sh multi_node_tp_node1.sh +export nccl_host=$1 +LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ +--model_dir /path/DeepSeek-R1 \ +--tp 16 \ +--enable_fa3 \ +--nnodes 2 \ +--node_rank 1 \ +--nccl_host $nccl_host \ +--nccl_port 2732 \ No newline at end of file diff --git a/test/start_scripts/multi_pd_master.sh b/test/start_scripts/multi_pd_master.sh new file mode 100644 index 000000000..c4e8c21fb --- /dev/null +++ b/test/start_scripts/multi_pd_master.sh @@ -0,0 +1,34 @@ +# Multi pd_master deployment example +python -m lightllm.server.api_server --run_mode "config_server" --config_server_host 10.120.114.74 --config_server_port 60088 + +python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60011 --config_server_host 10.120.114.74 --config_server_port 60088 + +python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60012 --config_server_host 10.120.114.74 --config_server_port 60088 + +nvidia-cuda-mps-control -d +CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \ +--run_mode "prefill" \ +--host 10.120.178.74 \ +--port 8019 \ +--tp 1 \ +--nccl_port 2732 \ +--max_total_token_num 40000 \ +--tokenizer_mode fast \ +--max_req_total_len 16000 \ +--running_max_req_size 128 \ +--disable_cudagraph \ +--config_server_host 10.120.114.74 \ +--config_server_port 60088 + +CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \ +--run_mode "decode" \ +--host 10.120.178.74 \ +--port 8121 \ +--nccl_port 12322 \ +--tp 1 \ +--max_total_token_num 40000 \ +--graph_max_len_in_batch 2048 \ +--graph_max_batch_size 16 \ +--tokenizer_mode fast \ +--config_server_host 10.120.114.74 \ +--config_server_port 60088 \ No newline at end of file diff --git a/test/start_scripts/multi_pd_master/config_server.sh b/test/start_scripts/multi_pd_master/config_server.sh new file mode 100644 index 000000000..3771cd1cd --- /dev/null +++ b/test/start_scripts/multi_pd_master/config_server.sh @@ -0,0 +1,5 @@ +# config_server +# config_server_host: the host of the config server +# sh config_server.sh +export config_server_host=$1 +python -m lightllm.server.api_server --run_mode "config_server" --config_server_host $config_server_host --config_server_port 60088 diff --git a/test/start_scripts/multi_pd_master/pd_decode.sh b/test/start_scripts/multi_pd_master/pd_decode.sh new file mode 100644 index 000000000..4cefef6fb --- /dev/null +++ b/test/start_scripts/multi_pd_master/pd_decode.sh @@ -0,0 +1,20 @@ +# decode +# host: the host of the decode server +# config_server_host: the host of the config server +# sh pd_decode.sh +export host=$1 +export config_server_host=$2 +nvidia-cuda-mps-control -d +MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \ +--model_dir /path/DeepSeek-R1 \ +--run_mode "decode" \ +--host $host \ +--port 8121 \ +--nccl_port 12322 \ +--tp 8 \ +--dp 8 \ +--enable_fa3 \ +--config_server_host $config_server_host \ +--config_server_port 60088 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_decode_microbatch_overlap \ No newline at end of file diff --git 
a/test/start_scripts/multi_pd_master/pd_master_1.sh b/test/start_scripts/multi_pd_master/pd_master_1.sh new file mode 100644 index 000000000..b71024525 --- /dev/null +++ b/test/start_scripts/multi_pd_master/pd_master_1.sh @@ -0,0 +1,7 @@ +# pd_master 1 +# host: the host of the pd master +# config_server_host: the host of the config server +# sh pd_master_1.sh +export host=$1 +export config_server_host=$2 +python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 --run_mode "pd_master" --host $host --port 60011 --config_server_host $config_server_host --config_server_port 60088 diff --git a/test/start_scripts/multi_pd_master/pd_master_2.sh b/test/start_scripts/multi_pd_master/pd_master_2.sh new file mode 100644 index 000000000..f3a474d95 --- /dev/null +++ b/test/start_scripts/multi_pd_master/pd_master_2.sh @@ -0,0 +1,7 @@ +# pd_master 2 +# host: the host of the pd master +# config_server_host: the host of the config server +# sh pd_master_2.sh +export host=$1 +export config_server_host=$2 +python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 --run_mode "pd_master" --host $host --port 60012 --config_server_host $config_server_host --config_server_port 60088 diff --git a/test/start_scripts/multi_pd_master/pd_prefill.sh b/test/start_scripts/multi_pd_master/pd_prefill.sh new file mode 100644 index 000000000..b845da435 --- /dev/null +++ b/test/start_scripts/multi_pd_master/pd_prefill.sh @@ -0,0 +1,21 @@ +# prefill +# host: the host of the prefill server +# config_server_host: the host of the config server +# sh pd_prefill.sh +export host=$1 +export config_server_host=$2 +nvidia-cuda-mps-control -d +MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \ +--model_dir /path/DeepSeek-R1 \ +--run_mode "prefill" \ +--host $host \ +--port 8019 \ +--tp 8 \ +--dp 8 \ +--nccl_port 2732 \ +--enable_fa3 \ +--disable_cudagraph \ +--config_server_host $config_server_host \ +--config_server_port 60088 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_prefill_microbatch_overlap \ No newline at end of file diff --git a/test/start_scripts/single_node_ep.sh b/test/start_scripts/single_node_ep.sh new file mode 100644 index 000000000..cad172d51 --- /dev/null +++ b/test/start_scripts/single_node_ep.sh @@ -0,0 +1,9 @@ +# H200 single node deepseek R1 dpep mode +MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ +--model_dir /path/DeepSeek-R1 \ +--tp 8 \ +--dp 8 \ +--enable_fa3 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_prefill_microbatch_overlap \ +#--enable_decode_microbatch_overlap \ diff --git a/test/start_scripts/single_node_tp.sh b/test/start_scripts/single_node_tp.sh new file mode 100644 index 000000000..1fb461bb1 --- /dev/null +++ b/test/start_scripts/single_node_tp.sh @@ -0,0 +1,8 @@ +# H200 single node deepseek R1 tp mode +LOADWORKER=18 python -m lightllm.server.api_server --port 8088 \ +--model_dir /path/DeepSeek-R1 \ +--tp 8 \ +--enable_fa3 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_prefill_microbatch_overlap \ +#--enable_decode_microbatch_overlap \ diff --git a/test/start_scripts/single_pd_master/pd_decode.sh b/test/start_scripts/single_pd_master/pd_decode.sh new file mode 100644 index 000000000..3bef53875 --- /dev/null +++ b/test/start_scripts/single_pd_master/pd_decode.sh @@ -0,0 +1,21 @@ +# PD decode mode for deepseek R1 (DP+EP) on H200 +# host: the host of the current node +# pd_master_ip: the ip of 
the pd master +# sh pd_decode.sh +export host=$1 +export pd_master_ip=$2 +nvidia-cuda-mps-control -d +MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \ +--model_dir /path/DeepSeek-R1 \ +--run_mode "decode" \ +--tp 8 \ +--dp 8 \ +--host $host \ +--port 8121 \ +--nccl_port 12322 \ +--enable_fa3 \ +--disable_cudagraph \ +--pd_master_ip $pd_master_ip \ +--pd_master_port 60011 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_decode_microbatch_overlap \ No newline at end of file diff --git a/test/start_scripts/single_pd_master/pd_master.sh b/test/start_scripts/single_pd_master/pd_master.sh new file mode 100644 index 000000000..600ef90b7 --- /dev/null +++ b/test/start_scripts/single_pd_master/pd_master.sh @@ -0,0 +1,5 @@ +# pd_master for deepseek R1 +# pd_master_ip: the ip of the pd master +# sh pd_master.sh +export pd_master_ip=$1 +python -m lightllm.server.api_server --model_dir /path/DeepSeek-R1 --run_mode "pd_master" --host $pd_master_ip --port 60011 \ No newline at end of file diff --git a/test/start_scripts/single_pd_master/pd_prefill.sh b/test/start_scripts/single_pd_master/pd_prefill.sh new file mode 100644 index 000000000..b15e4ef70 --- /dev/null +++ b/test/start_scripts/single_pd_master/pd_prefill.sh @@ -0,0 +1,21 @@ +# PD prefill mode for deepseek R1 (DP+EP) on H200 +# host: the host of the current node +# pd_master_ip: the ip of the pd master +# sh pd_prefill.sh +export host=$1 +export pd_master_ip=$2 +nvidia-cuda-mps-control -d +MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \ +--model_dir /path/DeepSeek-R1 \ +--run_mode "prefill" \ +--tp 8 \ +--dp 8 \ +--host $host \ +--port 8019 \ +--nccl_port 2732 \ +--enable_fa3 \ +--disable_cudagraph \ +--pd_master_ip $pd_master_ip \ +--pd_master_port 60011 +# if you want to enable microbatch overlap, you can uncomment the following lines +#--enable_prefill_microbatch_overlap \ No newline at end of file diff --git a/test/test.sh b/test/test.sh deleted file mode 100644 index 8f3882386..000000000 --- a/test/test.sh +++ /dev/null @@ -1,107 +0,0 @@ -# pd start -python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat --run_mode "pd_master" --host `hostname -i` --port 60011 - -nvidia-cuda-mps-control -d -CUDA_VISIBLE_DEVICES=0,1,2,3 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \ ---run_mode "prefill" \ ---host `hostname -i` \ ---port 8019 \ ---tp 4 \ ---nccl_port 2732 \ ---max_total_token_num 400000 \ ---tokenizer_mode fast \ ---pd_master_ip `hostname -i` \ ---pd_master_port 60011 \ ---max_req_total_len 16000 \ ---running_max_req_size 128 \ ---disable_cudagraph - -nvidia-cuda-mps-control -d -CUDA_VISIBLE_DEVICES=4,5,6,7 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \ ---run_mode "decode" \ ---host `hostname -i` \ ---port 8121 \ ---nccl_port 12322 \ ---tp 4 \ ---max_total_token_num 400000 \ ---graph_max_len_in_batch 2048 \ ---graph_max_batch_size 16 \ ---tokenizer_mode fast \ ---pd_master_ip `hostname -i` \ ---pd_master_port 60011 - -# pd start1 -python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat --run_mode "pd_master" --host `hostname -i` --port 60011 - -nvidia-cuda-mps-control -d -CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \ ---run_mode "prefill" \ ---host `hostname -i` \ ---port 8019 \ ---tp 1 \ 
---nccl_port 2732 \ ---max_total_token_num 40000 \ ---tokenizer_mode fast \ ---pd_master_ip `hostname -i` \ ---pd_master_port 60011 \ ---max_req_total_len 16000 \ ---running_max_req_size 128 \ ---disable_cudagraph - -nvidia-cuda-mps-control -d -CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /dev/shm/llama2-7b-chat \ ---run_mode "decode" \ ---host `hostname -i` \ ---port 8121 \ ---nccl_port 12322 \ ---tp 1 \ ---max_total_token_num 40000 \ ---graph_max_len_in_batch 2048 \ ---graph_max_batch_size 16 \ ---tokenizer_mode fast \ ---pd_master_ip `hostname -i` \ ---pd_master_port 60011 - - -# normal start -LOADWORKER=8 python -m lightllm.server.api_server --port 8018 --model_dir /dev/shm/llama2-7b-chat --tp 2 --graph_max_batch_size 16 - - -# 多 pd_master 节点部署实列 -python -m lightllm.server.api_server --run_mode "config_server" --config_server_host 10.120.114.74 --config_server_port 60088 - -python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60011 --config_server_host 10.120.114.74 --config_server_port 60088 - -python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60012 --config_server_host 10.120.114.74 --config_server_port 60088 - - -nvidia-cuda-mps-control -d -CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \ ---run_mode "prefill" \ ---host 10.120.178.74 \ ---port 8019 \ ---tp 1 \ ---nccl_port 2732 \ ---max_total_token_num 40000 \ ---tokenizer_mode fast \ ---max_req_total_len 16000 \ ---running_max_req_size 128 \ ---disable_cudagraph \ ---config_server_host 10.120.114.74 \ ---config_server_port 60088 - -CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \ ---run_mode "decode" \ ---host 10.120.178.74 \ ---port 8121 \ ---nccl_port 12322 \ ---tp 1 \ ---max_total_token_num 40000 \ ---graph_max_len_in_batch 2048 \ ---graph_max_batch_size 16 \ ---tokenizer_mode fast \ ---config_server_host 10.120.114.74 \ ---config_server_port 60088 - - - diff --git a/test/test_accuracy.py b/test/test_accuracy.py deleted file mode 100644 index 5ea825356..000000000 --- a/test/test_accuracy.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import subprocess -import time -import os -import requests -import sys -import json - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--tp", type=int, required=True, help="Number of GPUs to use.") - parser.add_argument("--model_dir", type=str, required=True, help="Directory of the model.") - return parser.parse_args() - - -def start_server(tp, model_dir): - cmd = [ - "python", - "-m", - "lightllm.server.api_server", - "--tp", - str(tp), - "--model_dir", - model_dir, - "--data_type", - "fp16", - "--mode", - "triton_gqa_flashdecoding", - "--trust_remote_code", - "--tokenizer_mode", - "fast", - "--host", - "0.0.0.0", - "--port", - "8080", - ] - process = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr) - return process - - -def check_health(): - health_url = "http://localhost:8080/health" - try: - r = requests.get(health_url, timeout=2) - return r.status_code == 200 - except Exception: - return False - - -def send_prompts(prompts, output_file): - for prompt in prompts: - while not check_health(): - time.sleep(1) - - request_data = { - "inputs": prompt, 
- "parameters": {"max_new_tokens": 1024, "frequency_penalty": 1, "do_sample": False}, - "multimodal_params": {}, - } - - try: - r = requests.post("http://localhost:8080/generate", json=request_data, timeout=10) - response_json = json.loads(r.text) - generated_text = ( - response_json["generated_text"][0] if "generated_text" in response_json else "No generated_text." - ) - except Exception as e: - generated_text = f"ERROR: {str(e)}" - - with open(output_file, "a", encoding="utf-8") as f: - f.write(f"===== prompt: {prompt} =====\n") - f.write(f"{generated_text}\n\n") - - print(f"===================Ouput saved in {output_file}===========================") - - -def main(): - # args - args = parse_args() - tp = args.tp - model_dir = args.model_dir - - # output_file - output_file = "test_results.txt" - - if os.path.exists(output_file): - os.remove(output_file) - - # start server - process = start_server(tp, model_dir) - - # prompts - prompts = [ - "What is the machine learning?", - "1+1等于几", - "What role does attention play in transformer architectures?", - "西红柿炒鸡蛋怎么做?", - "Describe the concept of overfitting and underfitting.", - "CPU和GPU的区别是什么?", - "What is the role of a loss function in machine learning?", - ] - - send_prompts(prompts, output_file) - - # shutdown server - process.terminate() - process.wait() - - -if __name__ == "__main__": - main() - -# python test_accuracy.py --tp 2 --model_dir /xx/xx diff --git a/test/test.jpg b/test/test_api/test.jpg similarity index 100% rename from test/test.jpg rename to test/test_api/test.jpg diff --git a/test/test_server.py b/test/test_api/test_generate_api.py similarity index 100% rename from test/test_server.py rename to test/test_api/test_generate_api.py diff --git a/test/test_multimodal_server.py b/test/test_api/test_multimodal_api.py similarity index 100% rename from test/test_multimodal_server.py rename to test/test_api/test_multimodal_api.py diff --git a/test/test_api/test_openai_api.py b/test/test_api/test_openai_api.py new file mode 100644 index 000000000..6d98dadbe --- /dev/null +++ b/test/test_api/test_openai_api.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +LightLLM OpenAI API test cases + +python test_openai_api.py +""" + +import requests +import json +import time +from typing import Dict, List, Any, Optional + + +class LightLLMClient: + """Minimal client for the LightLLM OpenAI-compatible chat API""" + + def __init__(self, base_url: str = "http://localhost:8000", model_name: str = "your_model_name"): + self.base_url = base_url + self.model_name = model_name + self.headers = {"Content-Type": "application/json"} + self.conversation_history = [] + + def simple_chat(self, message: str, **kwargs) -> Dict[str, Any]: + """Simple (non-streaming) chat completion""" + data = { + "model": self.model_name, + "messages": [{"role": "user", "content": message}], + "temperature": kwargs.get("temperature", 0.7), + "max_tokens": kwargs.get("max_tokens", 1000), + **kwargs, + } + + response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data) + + if response.status_code == 200: + return response.json() + else: + raise Exception(f"API call failed: {response.status_code} - {response.text}") + + def stream_chat(self, message: str, **kwargs): + data = { + "model": self.model_name, + "messages": [{"role": "user", "content": message}], + "stream": True, + "temperature": kwargs.get("temperature", 0.7), + "max_tokens": kwargs.get("max_tokens", 1000), + **kwargs, + } + + response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data, 
stream=True) + + if response.status_code == 200: + for line in response.iter_lines(): + if line: + line = line.decode("utf-8") + if line.startswith("data: "): + data_str = line[6:] + if data_str == "[DONE]": + break + try: + chunk = json.loads(data_str) + if chunk["choices"][0]["delta"].get("content"): + yield chunk["choices"][0]["delta"]["content"] + except json.JSONDecodeError: + continue + else: + raise Exception(f"API调用失败: {response.status_code} - {response.text}") + + def function_call(self, message: str, tools: List[Dict], tool_choice: str = "auto", **kwargs) -> Dict[str, Any]: + """Function calling""" + data = { + "model": self.model_name, + "messages": [{"role": "user", "content": message}], + "tools": tools, + "tool_choice": tool_choice, + "temperature": kwargs.get("temperature", 0.7), + "max_tokens": kwargs.get("max_tokens", 1000), + **kwargs, + } + + response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data) + + if response.status_code == 200: + return response.json() + else: + raise Exception(f"API调用失败: {response.status_code} - {response.text}") + + def stream_function_call(self, message: str, tools: List[Dict], tool_choice: str = "auto", **kwargs): + """stream Function calling""" + data = { + "model": self.model_name, + "messages": [{"role": "user", "content": message}], + "tools": tools, + "tool_choice": tool_choice, + "stream": True, + "temperature": kwargs.get("temperature", 0.7), + "max_tokens": kwargs.get("max_tokens", 1000), + **kwargs, + } + + response = requests.post(f"{self.base_url}/v1/chat/completions", headers=self.headers, json=data, stream=True) + + if response.status_code == 200: + content_buffer = "" + tool_calls_buffer = [] + + for line in response.iter_lines(): + if line: + line = line.decode("utf-8") + if line.startswith("data: "): + data_str = line[6:] + if data_str == "[DONE]": + break + try: + chunk = json.loads(data_str) + delta = chunk["choices"][0]["delta"] + + # 处理内容 + if delta.get("content"): + content_buffer += delta["content"] + yield {"type": "content", "data": delta["content"]} + + # 处理函数调用 + if delta.get("tool_calls"): + for tool_call in delta["tool_calls"]: + tool_calls_buffer.append(tool_call) + yield {"type": "tool_call", "data": tool_call} + + except json.JSONDecodeError: + continue + else: + raise Exception(f"API调用失败: {response.status_code} - {response.text}") + + +def test_simple_chat(): + client = LightLLMClient() + + try: + result = client.simple_chat("你好,请介绍一下你自己") + print("用户: 你好,请介绍一下你自己") + print("助手:", result["choices"][0]["message"]["content"]) + print() + except Exception as e: + print(f"错误: {e}") + print("请确保 LightLLM 服务已启动,并检查配置") + + +def test_stream_chat(): + client = LightLLMClient() + + try: + print("用户: 请写一个关于人工智能的短文") + print("助手: ", end="", flush=True) + + for chunk in client.stream_chat("请写一个关于人工智能的短文"): + print(chunk, end="", flush=True) + print("\n") + except Exception as e: + print(f"错误: {e}") + + +def test_function_call(): + client = LightLLMClient() + + # 定义函数 + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "获取指定城市的天气信息", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "城市名称,例如:北京、上海"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "温度单位"}, + }, + "required": ["city"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "calculate", + "description": "执行数学计算", + "parameters": { + "type": "object", + "properties": {"expression": 
{"type": "string", "description": "数学表达式,例如:2+3*4"}}, + "required": ["expression"], + }, + }, + }, + ] + + try: + # 测试天气查询 + print("用户: 北京今天天气怎么样?") + result = client.function_call("北京今天天气怎么样?", tools) + message = result["choices"][0]["message"] + + if message.get("tool_calls"): + print("助手决定调用函数:") + for tool_call in message["tool_calls"]: + print(f" 函数名: {tool_call['function']['name']}") + print(f" 参数: {tool_call['function']['arguments']}") + else: + print("助手:", message["content"]) + print() + + # 测试数学计算 + print("用户: 请计算 25 * 4 + 10 的结果") + result = client.function_call("请计算 25 * 4 + 10 的结果", tools) + message = result["choices"][0]["message"] + + if message.get("tool_calls"): + print("助手决定调用函数:") + for tool_call in message["tool_calls"]: + print(f" 函数名: {tool_call['function']['name']}") + print(f" 参数: {tool_call['function']['arguments']}") + else: + print("助手:", message["content"]) + print() + + except Exception as e: + print(f"错误: {e}") + + +def test_stream_function_call(): + + client = LightLLMClient() + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "获取指定城市的天气信息", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "城市名称"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["city"], + }, + }, + } + ] + + try: + print("用户: 上海今天天气怎么样?") + print("助手: ", end="", flush=True) + + for chunk in client.stream_function_call("上海今天天气怎么样?", tools): + if chunk["type"] == "content": + print(chunk["data"], end="", flush=True) + elif chunk["type"] == "tool_call": + print(f"\n[函数调用: {chunk['data']['function']['name']}]") + if chunk["data"]["function"].get("arguments"): + print(f"参数: {chunk['data']['function']['arguments']}") + print("\n") + + except Exception as e: + print(f"错误: {e}") + + +def main(): + test_simple_chat() + test_stream_chat() + test_function_call() + test_stream_function_call() + + +if __name__ == "__main__": + main() diff --git a/test/test_constraint_server.py b/test/test_constraint_server.py deleted file mode 100644 index 46802239f..000000000 --- a/test/test_constraint_server.py +++ /dev/null @@ -1,66 +0,0 @@ -import time -import requests -import json -import threading - -""" -python -m lightllm.server.api_server --model_dir /Meta-Llama-3-8B-Instruct \ - --host 0.0.0.0 \ - --port 8017 \ - --tp 1 \ - --max_total_token_num 100000 \ - --simple_constraint_mode -""" - - -class RequestThread(threading.Thread): - def __init__(self, url, headers, data): - threading.Thread.__init__(self) - self.url = url - self.headers = headers - self.data = data - - def run(self): - response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data)) - if response.status_code == 200: - print(response.json()) - else: - print("Error:", response.status_code, response.text) - - -url = "http://localhost:8017/generate" -headers = {"Content-Type": "application/json"} - -for i in range(1): - data = { - "inputs": "(100+1+3)*2=", - # 'temperature': 0.1, - "parameters": {"do_sample": False, "regular_constraint": r"-?\d+"}, - } - thread = RequestThread(url, headers, data) - thread.start() - -time.sleep(2) - -for i in range(20): - data = { - "inputs": "Are dog a man? ", - "parameters": { - "do_sample": False, - "ignore_eos": True, - "max_new_tokens": 200, - "regular_constraint": r"(Yes|No) Reason is [a-zA-Z\s]+", - }, - } - thread = RequestThread(url, headers, data) - thread.start() - -time.sleep(10) - -for i in range(20): - data = { - "inputs": "Are dog a man? 
", - "parameters": {"do_sample": False, "ignore_eos": True, "max_new_tokens": 200, "allowed_token_ids": [2, 3]}, - } - thread = RequestThread(url, headers, data) - thread.start() diff --git a/test/test_function_call_api.py b/test/test_function_call_api.py deleted file mode 100644 index 584b41d84..000000000 --- a/test/test_function_call_api.py +++ /dev/null @@ -1,72 +0,0 @@ -import time -import requests -import json -import threading - - -class RequestThread(threading.Thread): - def __init__(self, url, headers, data): - threading.Thread.__init__(self) - self.url = url - self.headers = headers - self.data = data - - def run(self): - response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data)) - if response.status_code == 200: - print(response.json()) - else: - print("Error:", response.status_code, response.text) - - -openai_url = "http://localhost:8888/v1/chat/completions" -headers = {"Content-Type": "application/json"} - -# Test OpenAI Tool Call API -messages = [ - { - "role": "user", - "content": "What's the weather like in Boston today? " - "Output a reasoning before act, then use the tools to help you.", - } -] -tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": "The city to find the weather for, e.g. 'San Francisco'", - }, - "state": { - "type": "string", - "description": "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'", - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["city", "state", "unit"], - }, - }, - } -] - -for i in range(1): - data = { - "model": "qwen25", - "messages": messages, - "tools": tools, - "do_sample": False, - "max_tokens": 1024, - } - thread = RequestThread(openai_url, headers, data) - thread.start()