
Commit ff7fa9c

add draft test.py
1 parent 937af93 commit ff7fa9c

File tree

1 file changed: +150 -0 lines changed

test.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
import time

import torch
import numpy as np
from torch.profiler import profile, record_function, ProfilerActivity

data_o = torch.zeros((128 * 1024), dtype=torch.int32, device="cuda")
in_data = list(range(0, 1000))
in_datas = [list(range(0, 1000)) for _ in range(100)]

# Same-sized host buffers, one pageable and one pinned, plus a device buffer.
cpu_tensor = torch.zeros((128 * 1024), dtype=torch.int32, device="cpu", pin_memory=False)
pin_mem_tensor = torch.zeros((128 * 1024), dtype=torch.int32, device="cpu", pin_memory=True)
gpu_tensor = torch.zeros((128 * 1024), dtype=torch.int32, device="cuda")

a = torch.arange(0, 10).cuda()
b = torch.arange(0, 10).cuda()

print((gpu_tensor == 1).dtype)  # elementwise comparison yields torch.bool
# max_data = tmp.max()
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=False,
    profile_memory=False,
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./profile/profile.file"),
) as prof:
    # gpu_tensor[:] = pin_mem_tensor
    # torch.cuda.current_stream().synchronize()
    # a = torch.tensor([1, 3, 7], device="cuda")
    # gpu_tensor[:] = pin_mem_tensor
    for _ in range(100):
        # non_blocking=True can only overlap with compute when the source is pinned;
        # a copy from pageable memory such as cpu_tensor degrades to a blocking copy.
        cpu_tensor.cuda(non_blocking=True)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=16), flush=True)
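
# Added sketch (not in the original commit): a minimal pinned-vs-pageable timing
# comparison that the commented-out lines above hint at. The helper name
# _time_h2d is hypothetical; it reuses torch/time from the imports above and
# assumes a CUDA device is available.
def _time_h2d(src, iters=100):
    """Wall-clock seconds for `iters` host-to-device copies from `src`."""
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(iters):
        src.cuda(non_blocking=True)
    torch.cuda.synchronize()  # drain any async copies before stopping the clock
    return time.time() - t0

print("pageable H2D:", _time_h2d(cpu_tensor))
print("pinned H2D:  ", _time_h2d(pin_mem_tensor))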

# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 1 --diverse_mode | tee log.txt

# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 1 --dp 1 --diverse_mode | tee log.txt  (try this one)

# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 1 --output_constraint_mode xgrammar | tee log.txt

# Write through the pinned tensor's NumPy view; tensor and array share the same host memory.
pin_mem_tensor.numpy()[0:10] = list(range(10))

print("ok")

# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 8 --dp 8 | tee log.txt  (try this one)

# LOADWORKER=16 python -m lightllm.server.api_server --model_dir /mtc/DeepSeek-R1 --mtp_draft_model_dir /mtc/DeepSeek-R1-NextN/ --mtp_mode deepseekv3 --mtp_step 1 --enable_fa3 --graph_max_batch_size 64 --tp 8 --port 15001 | tee debug.txt

# MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8019 \
# --model_dir /mtc/DeepSeek-R1 \
# --tp 8 \
# --dp 8 \
# --enable_fa3 \
# --enable_prefill_microbatch_overlap \
# --enable_decode_microbatch_overlap \
# --mem_fraction 0.8 \
# --batch_max_tokens 4096

# MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8019 \
# --model_dir /mtc/DeepSeek-R1 \
# --tp 8 \
# --dp 8 \
# --enable_fa3 \
# --mem_fraction 0.8 \
# --batch_max_tokens 4096 \
# --mtp_draft_model_dir /mtc/DeepSeek-R1-NextN/ --mtp_mode deepseekv3 --mtp_step 1

# CUDA_VISIBLE_DEVICES=0,1 LOADWORKER=18 python -m lightllm.server.api_server --port 8019 \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --tp 4 \
# --enable_fa3 \
# --nnodes 2 \
# --node_rank 0 \
# --nccl_host 127.0.0.1 \
# --nccl_port 2732

# CUDA_VISIBLE_DEVICES=2,3 LOADWORKER=18 python -m lightllm.server.api_server --port 8021 \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --tp 4 \
# --enable_fa3 \
# --nnodes 2 \
# --node_rank 1 \
# --nccl_host 127.0.0.1 \
# --nccl_port 2732

# python -m lightllm.server.api_server --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --run_mode "pd_master" --host 127.0.0.1 --port 60011

# CUDA_VISIBLE_DEVICES=0,1 MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --run_mode "prefill" \
# --tp 2 \
# --dp 1 \
# --host 0.0.0.0 \
# --port 8019 \
# --nccl_port 2732 \
# --enable_fa3 \
# --disable_cudagraph \
# --pd_master_ip 127.0.0.1 \
# --pd_master_port 60011

# CUDA_VISIBLE_DEVICES=2,3 MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --run_mode "decode" \
# --tp 2 \
# --dp 1 \
# --host 0.0.0.0 \
# --port 8121 \
# --nccl_port 27321 \
# --enable_fa3 \
# --pd_master_ip 127.0.0.1 \
# --pd_master_port 60011

# CUDA_VISIBLE_DEVICES=0,1 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 4 --nccl_port 27321 --node_rank 0 --nnodes 2 | tee log.txt

# CUDA_VISIBLE_DEVICES=2,3 LOADWORKER=16 python -m lightllm.server.api_server --port 8011 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 4 --nccl_port 27321 --node_rank 1 --nnodes 2

# LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 8 --dp 1 --enable_fa3

# lightllm v1.0.1-4209c8c4-deepep

# docker run -itd --gpus all --privileged=true --shm-size=128G -v /mtc:/mtc --name wzj 44feca8a0c86

# LOADWORKER=16 python -m lightllm.server.api_server --port 8011 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 1 --dp 1 --nccl_port 27321 --enable_cpu_cache
