|
1 | | -使用下面命令下载并运行推理服务 |
| 1 | +# Start inference |
2 | 2 |
|
| 3 | +## For RK3588 |
| 4 | + |
| 5 | +```bash |
| 6 | +docker run -it --name deepseek-r1-1.5b-fp16 --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-fp16-latest |
| 7 | +``` |
| 8 | + |
| 9 | +## For RK3576 |
| 10 | + |
| 11 | +```bash |
| 12 | +docker run -it --name deepseek-r1-1.5b-fp16 --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3576-deepseek-r1-distill-qwen:1.5b-fp16-latest |
| 13 | +``` |
| 14 | + |
| 15 | +> Note: Once the service is running, you can access `http://localhost:8080/docs` and `http://localhost:8080/redoc` to view the API documentation.
| 16 | +# Test the API
| 17 | + |
| 18 | +## Non-streaming response: |
| 19 | + |
| 20 | +```bash |
| 21 | +curl http://127.0.0.1:8080/v1/chat/completions \ |
| 22 | + -H "Content-Type: application/json" \ |
| 23 | + -d '{ |
| 24 | + "model": "rkllm-model", |
| 25 | + "messages": [ |
| 26 | + {"role": "user", "content": "Where is the capital of China?"} |
| 27 | + ], |
| 28 | + "temperature": 1, |
| 29 | + "max_tokens": 512, |
| 30 | + "top_k": 1, |
| 31 | + "stream": false |
| 32 | + }' |
3 | 33 | ``` |
4 | | -docker pull ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-w8a8-latest |
5 | 34 |
|
6 | | -docker run -it --name deepseek-r1-1.5b --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-w8a8-latest |
| 35 | +## Streaming response: |
7 | 36 |
|
| 37 | +```bash |
| 38 | +curl -N http://127.0.0.1:8080/v1/chat/completions \ |
| 39 | + -H "Content-Type: application/json" \ |
| 40 | + -d '{ |
| 41 | + "model": "rkllm-model", |
| 42 | + "messages": [ |
| 43 | + {"role": "user", "content": "Where is the capital of China?"} |
| 44 | + ], |
| 45 | + "temperature": 2, |
| 46 | + "max_tokens": 512, |
| 47 | + "top_k": 1, |
| 48 | + "stream": true |
| 49 | + }' |
8 | 50 | ``` |
9 | 51 |
|
10 | | -使用下面命令来测试: |
| 52 | +# Use the OpenAI API to test
| 53 | + |
| 54 | +## Non-streaming response: |
| 55 | + |
| 56 | +```python |
| 57 | +import openai |
| 58 | + |
| 59 | +# Configure the OpenAI client to use your local server |
| 60 | +client = openai.OpenAI( |
| 61 | + base_url="http://localhost:8080/v1", # Point to your local server |
| 62 | + api_key="dummy-key" # The API key can be anything for this local server |
| 63 | +) |
| 64 | + |
| 65 | +# Test the API |
| 66 | +response = client.chat.completions.create( |
| 67 | + model="rkllm-model", |
| 68 | + messages=[ |
| 69 | + {"role": "system", "content": "You are a helpful assistant."}, |
| 70 | + {"role": "user", "content": "Where is the capital of China?"} |
| 71 | + ], |
| 72 | + temperature=0.7, |
| 73 | + max_tokens=512 |
| 74 | +) |
11 | 75 |
|
| 76 | +print(response.choices[0].message.content) |
12 | 77 | ``` |
13 | | -curl http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ |
14 | | -"messages": [ |
15 | | -{"role": "user", "content": "请解释一下相对论的基本概念。"} |
16 | | -], |
17 | | -"n_keep": 0, |
18 | | -"cache_prompt": false, |
19 | | -"id_slot": 0, |
20 | | -"n_predict": 512, |
21 | | -"stream": true |
22 | | -}' |
23 | | -``` |
| 78 | + |
| 79 | +## Streaming response: |
| 80 | + |
| 81 | +```python |
| 82 | +import openai |
| 83 | + |
| 84 | +# Configure the OpenAI client to use your local server |
| 85 | +client = openai.OpenAI( |
| 86 | + base_url="http://localhost:8080/v1", # Point to your local server |
| 87 | + api_key="dummy-key" # The API key can be anything for this local server |
| 88 | +) |
| 89 | + |
| 90 | +# Test the API with streaming |
| 91 | +response_stream = client.chat.completions.create( |
| 92 | + model="rkllm-model", |
| 93 | + messages=[ |
| 94 | + {"role": "system", "content": "You are a helpful assistant."}, |
| 95 | + {"role": "user", "content": "Where is the capital of China?"} |
| 96 | + ], |
| 97 | + temperature=0.7, |
| 98 | + max_tokens=512, |
| 99 | + stream=True # Enable streaming |
| 100 | +) |
| 101 | + |
| 102 | +# Process the streaming response |
| 103 | +for chunk in response_stream: |
| 104 | + if chunk.choices[0].delta.content is not None: |
| 105 | + print(chunk.choices[0].delta.content, end="", flush=True) |
| 106 | +``` |
| 107 | + |
| 108 | +# Speed test |
| 109 | + |
| 110 | +> Note: A rough estimate of a model's inference speed includes both TTFT (time to first token) and TPOT (time per output token).
| 111 | +
|
| 112 | +```bash |
| 113 | +python -m venv .env && source .env/bin/activate |
| 114 | +pip install requests |
| 115 | +python test_inference_speed.py |
| 116 | +``` |
| 117 | + |
| 118 | +> Note: You can run `python test_inference_speed.py --help` to view usage information.
0 commit comments