|
1 | | -使用下面命令下载并运行推理服务 |
| 1 | +# Start inference |
2 | 2 |
|
| 3 | +## For RK3588 |
| 4 | + |
| 5 | +```bash |
| 6 | +docker run -it --name deepseek-r1-1.5b-fp16 --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-fp16-latest |
| 7 | +``` |
| 8 | + |
| 9 | +## For RK3576 |
| 10 | + |
| 11 | +```bash |
| 12 | +docker run -it --name deepseek-r1-1.5b-fp16 --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3576-deepseek-r1-distill-qwen:1.5b-fp16-latest |
| 13 | +``` |
| 14 | + |
| 15 | +> Note: Once the service is running, you can access `http://localhost:8080/docs` and `http://localhost:8080/redoc` to view the API documentation.
| 16 | +# Test the API
| 17 | + |
| 18 | +## Non-streaming response: |
| 19 | + |
| 20 | +```bash |
| 21 | +curl http://127.0.0.1:8080/v1/chat/completions \ |
| 22 | + -H "Content-Type: application/json" \ |
| 23 | + -d '{ |
| 24 | + "model": "rkllm-model", |
| 25 | + "messages": [ |
| 26 | + {"role": "user", "content": "Where is the capital of China?"} |
| 27 | + ], |
| 28 | + "temperature": 1, |
| 29 | + "max_tokens": 512, |
| 30 | + "top_k": 1, |
| 31 | + "stream": false |
| 32 | + }' |
3 | 33 | ``` |
4 | | -docker pull ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-w8a8-latest |
5 | 34 |
|
6 | | -docker run -it --name deepseek-r1-1.5b --privileged --net=host --device /dev/dri --device /dev/dma_heap --device /dev/rknpu --device /dev/mali0 -v /dev:/dev ghcr.io/lj-hao/rk3588-deepseek-r1-distill-qwen:1.5b-w8a8-latest |
| 35 | +## Streaming response: |
7 | 36 |
|
| 37 | +```bash |
| 38 | +curl -N http://127.0.0.1:8080/v1/chat/completions \ |
| 39 | + -H "Content-Type: application/json" \ |
| 40 | + -d '{ |
| 41 | + "model": "rkllm-model", |
| 42 | + "messages": [ |
| 43 | + {"role": "user", "content": "Where is the capital of China?"} |
| 44 | + ], |
| 45 | + "temperature": 2, |
| 46 | + "max_tokens": 512, |
| 47 | + "top_k": 1, |
| 48 | + "stream": true |
| 49 | + }' |
8 | 50 | ``` |
9 | 51 |
|
10 | | -使用下面命令来测试: |
| 52 | +# Use the OpenAI API to test
| 53 | + |
| 54 | +## Non-streaming response: |
| 55 | + |
| 56 | +```python |
| 57 | +import openai |
| 58 | + |
| 59 | +# Configure the OpenAI client to use your local server |
| 60 | +client = openai.OpenAI( |
| 61 | + base_url="http://localhost:8080/v1", # Point to your local server |
| 62 | + api_key="dummy-key" # The API key can be anything for this local server |
| 63 | +) |
| 64 | + |
| 65 | +# Test the API |
| 66 | +response = client.chat.completions.create( |
| 67 | + model="rkllm-model", |
| 68 | + messages=[ |
| 69 | + {"role": "system", "content": "You are a helpful assistant."}, |
| 70 | + {"role": "user", "content": "Where is the capital of China?"} |
| 71 | + ], |
| 72 | + temperature=0.7, |
| 73 | + max_tokens=512 |
| 74 | +) |
11 | 75 |
|
| 76 | +print(response.choices[0].message.content) |
12 | 77 | ``` |
13 | | -curl http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ |
14 | | -"messages": [ |
15 | | -{"role": "user", "content": "请解释一下相对论的基本概念。"} |
16 | | -], |
17 | | -"n_keep": 0, |
18 | | -"cache_prompt": false, |
19 | | -"id_slot": 0, |
20 | | -"n_predict": 512, |
21 | | -"stream": true |
22 | | -}' |
23 | | -``` |
| 78 | + |
| 79 | +## Streaming response: |
| 80 | + |
| 81 | +```python |
| 82 | +import openai |
| 83 | + |
| 84 | +# Configure the OpenAI client to use your local server |
| 85 | +client = openai.OpenAI( |
| 86 | + base_url="http://localhost:8080/v1", # Point to your local server |
| 87 | + api_key="dummy-key" # The API key can be anything for this local server |
| 88 | +) |
| 89 | + |
| 90 | +# Test the API with streaming |
| 91 | +response_stream = client.chat.completions.create( |
| 92 | + model="rkllm-model", |
| 93 | + messages=[ |
| 94 | + {"role": "system", "content": "You are a helpful assistant."}, |
| 95 | + {"role": "user", "content": "Where is the capital of China?"} |
| 96 | + ], |
| 97 | + temperature=0.7, |
| 98 | + max_tokens=512, |
| 99 | + stream=True # Enable streaming |
| 100 | +) |
| 101 | + |
| 102 | +# Process the streaming response |
| 103 | +for chunk in response_stream: |
| 104 | + if chunk.choices[0].delta.content is not None: |
| 105 | + print(chunk.choices[0].delta.content, end="", flush=True) |
| 106 | +``` |
| 107 | + |
| 108 | +# Speed test |
| 109 | + |
| 110 | +> Note: A rough estimate of a model's inference speed includes both TTFT (time to first token) and TPOT (time per output token).
| 111 | +
|
| 112 | +```bash |
| 113 | +python -m venv .env && source .env/bin/activate |
| 114 | +pip install requests |
| 115 | +python test_inference_speed.py |
| 116 | +``` |
| 117 | + |
| 118 | +> Note: You can run `python test_inference_speed.py --help` to view usage information.
0 commit comments