Skip to content

Commit 32c3864

Browse files
authored
Merge pull request #3 from LJ-Hao/main
update: streaming and non-streaming responses, and OpenAI API support
2 parents ab19801 + 12bdc94 commit 32c3864

9 files changed

+1944
-356
lines changed

.gitignore

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
*.egg-info/
7+
dist/
8+
build/
9+
10+
# Virtual environments
11+
venv/
12+
env/
13+
.venv/
14+
15+
# IDE
16+
.vscode/
17+
.idea/
18+
*.swp
19+
*.swo
20+
*~
21+
22+
# OS
23+
.DS_Store
24+
Thumbs.db
25+
26+
# Logs and databases
27+
*.log
28+
*.sqlite
29+
*.db
30+
31+
# Environment variables
32+
.env
33+
.env.local
34+
35+
# Jupyter
36+
.ipynb_checkpoints/
37+
38+
# PyTest
39+
.pytest_cache/
40+
.coverage
41+
42+
# MyPy
43+
.mypy_cache/
44+
45+
# Large files / models
46+
*.pth
47+
*.ckpt
48+
*.bin
49+
*.onnx
50+
*.h5
51+
*.pt
52+
53+
# Project specific
54+
*.rkllm
55+
config.py
56+
secrets.py

README.md

Lines changed: 110 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,118 @@
1-
使用下面命令下载并运行推理服务
1+
# Start inference
22

3+
## For RK3588
4+
5+
```bash
6+
docker run -it --name deepseek-r1-1.5b-fp16 --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-fp16-latest
7+
```
8+
9+
## For RK3576
10+
11+
```bash
12+
docker run -it --name deepseek-r1-1.5b-fp16 --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3576-deepseek-r1-distill-qwen:1.5b-fp16-latest
13+
```
14+
15+
> Note: When you start the service, you can access `http://localhost:8080/docs` and `http://localhost:8080/redoc` to view the documentation.
16+
# Test API:
17+
18+
## Non-streaming response:
19+
20+
```bash
21+
curl http://127.0.0.1:8080/v1/chat/completions \
22+
-H "Content-Type: application/json" \
23+
-d '{
24+
"model": "rkllm-model",
25+
"messages": [
26+
{"role": "user", "content": "Where is the capital of China?"}
27+
],
28+
"temperature": 1,
29+
"max_tokens": 512,
30+
"top_k": 1,
31+
"stream": false
32+
}'
333
```
4-
docker pull ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-w8a8-latest
534

6-
docker run -it --name deepseek-r1-1.5b --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-w8a8-latest
35+
## Streaming response:
736

37+
```bash
38+
curl -N http://127.0.0.1:8080/v1/chat/completions \
39+
-H "Content-Type: application/json" \
40+
-d '{
41+
"model": "rkllm-model",
42+
"messages": [
43+
{"role": "user", "content": "Where is the capital of China?"}
44+
],
45+
"temperature": 2,
46+
"max_tokens": 512,
47+
"top_k": 1,
48+
"stream": true
49+
}'
850
```
951

10-
使用下面命令来测试:
52+
# Use OpenAI API to test
53+
54+
## Non-streaming response:
55+
56+
```python
57+
import openai
58+
59+
# Configure the OpenAI client to use your local server
60+
client = openai.OpenAI(
61+
base_url="http://localhost:8080/v1", # Point to your local server
62+
api_key="dummy-key" # The API key can be anything for this local server
63+
)
64+
65+
# Test the API
66+
response = client.chat.completions.create(
67+
model="rkllm-model",
68+
messages=[
69+
{"role": "system", "content": "You are a helpful assistant."},
70+
{"role": "user", "content": "Where is the capital of China?"}
71+
],
72+
temperature=0.7,
73+
max_tokens=512
74+
)
1175

76+
print(response.choices[0].message.content)
1277
```
13-
curl http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
14-
"messages": [
15-
{"role": "user", "content": "请解释一下相对论的基本概念。"}
16-
],
17-
"n_keep": 0,
18-
"cache_prompt": false,
19-
"id_slot": 0,
20-
"n_predict": 512,
21-
"stream": true
22-
}'
23-
```
78+
79+
## Streaming response:
80+
81+
```python
82+
import openai
83+
84+
# Configure the OpenAI client to use your local server
85+
client = openai.OpenAI(
86+
base_url="http://localhost:8080/v1", # Point to your local server
87+
api_key="dummy-key" # The API key can be anything for this local server
88+
)
89+
90+
# Test the API with streaming
91+
response_stream = client.chat.completions.create(
92+
model="rkllm-model",
93+
messages=[
94+
{"role": "system", "content": "You are a helpful assistant."},
95+
{"role": "user", "content": "Where is the capital of China?"}
96+
],
97+
temperature=0.7,
98+
max_tokens=512,
99+
stream=True # Enable streaming
100+
)
101+
102+
# Process the streaming response
103+
for chunk in response_stream:
104+
if chunk.choices[0].delta.content is not None:
105+
print(chunk.choices[0].delta.content, end="", flush=True)
106+
```
107+
108+
# Speed test
109+
110+
> Note: A rough estimate of a model's inference speed includes both TTFT and TPOT.
111+
112+
```bash
113+
python -m venv .env && source .env/bin/activate
114+
pip install requests
115+
python test_inference_speed.py
116+
```
117+
118+
> Note: You can use `python test_inference_speed.py --help` to view the usage help.

docker/RK3576/LLM/DeepSeek-R1-Distill-Qwen.dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ WORKDIR /app
99
RUN mkdir -p /app/models
1010

1111
# 安装依赖
12-
COPY ./src/flask_server_requirements_llm.txt /app/requirements.txt
12+
COPY ./src/fastapi_server_requirements_llm.txt /app/requirements.txt
1313
RUN pip install --no-cache-dir -r /app/requirements.txt
1414

1515
# 拷贝运行时库和工具
@@ -27,11 +27,11 @@ RUN if [ -n "${MODEL_URL}" ]; then \
2727
wget --progress=dot:giga "${MODEL_URL}" -O "/app/models/${MODEL_FILE}"; \
2828
fi
2929

30-
COPY ./src/flask_server_llm.py /app/
30+
COPY ./src/fastapi_server_llm.py /app/
3131

3232
# 将 ARG 转为 ENV,这样 CMD 才能读取到
3333
ENV RKLLM_MODEL_PATH=/app/models/${MODEL_FILE}
3434

3535
EXPOSE 8080
3636

37-
CMD ["sh", "-c", "python3 /app/flask_server_llm.py --rkllm_model_path ${RKLLM_MODEL_PATH} --target_platform rk3576"]
37+
CMD ["sh", "-c", "python3 /app/fastapi_server_llm.py --rkllm_model_path ${RKLLM_MODEL_PATH} --target_platform rk3576"]

docker/RK3588/LLM/DeepSeek-R1-Distill-Qwen.dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ WORKDIR /app
99
RUN mkdir -p /app/models
1010

1111
# 安装依赖
12-
COPY ./src/flask_server_requirements_llm.txt /app/requirements.txt
12+
COPY ./src/fastapi_server_requirements_llm.txt /app/requirements.txt
1313
RUN pip install --no-cache-dir -r /app/requirements.txt
1414

1515
# 拷贝运行时库和工具
@@ -27,11 +27,11 @@ RUN if [ -n "${MODEL_URL}" ]; then \
2727
wget --progress=dot:giga "${MODEL_URL}" -O "/app/models/${MODEL_FILE}"; \
2828
fi
2929

30-
COPY ./src/flask_server_llm.py /app/
30+
COPY ./src/fastapi_server_llm.py /app/
3131

3232
# 将 ARG 转为 ENV,这样 CMD 才能读取到
3333
ENV RKLLM_MODEL_PATH=/app/models/${MODEL_FILE}
3434

3535
EXPOSE 8080
3636

37-
CMD ["sh", "-c", "python3 /app/flask_server_llm.py --rkllm_model_path ${RKLLM_MODEL_PATH} --target_platform rk3588"]
37+
CMD ["sh", "-c", "python3 /app/fastapi_server_llm.py --rkllm_model_path ${RKLLM_MODEL_PATH} --target_platform rk3588"]

0 commit comments

Comments
 (0)