import time

import numpy as np
import torch
from torch.profiler import profile, record_function, ProfilerActivity

data_o = torch.zeros((128 * 1024), dtype=torch.int32, device="cuda")
in_data = list(range(0, 1000))
in_datas = [list(range(0, 1000)) for _ in range(100)]

# Same-sized buffers in pageable host, pinned host, and device memory,
# used below to compare host-to-device copy behavior.
cpu_tensor = torch.zeros((128 * 1024), dtype=torch.int32, device="cpu", pin_memory=False)
pin_mem_tensor = torch.zeros((128 * 1024), dtype=torch.int32, device="cpu", pin_memory=True)
gpu_tensor = torch.zeros((128 * 1024), dtype=torch.int32, device="cuda")

a = torch.arange(0, 10).cuda()
b = torch.arange(0, 10).cuda()

# Elementwise comparison produces a boolean mask, so this prints torch.bool.
print((gpu_tensor == 1).dtype)
# max_data = tmp.max()
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=False,
    profile_memory=False,
    # tensorboard_trace_handler treats this path as an output directory.
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./profile/profile.file"),
) as prof:
    # gpu_tensor[:] = pin_mem_tensor
    # torch.cuda.current_stream().synchronize()
    # a = torch.tensor([1, 3, 7], device="cuda")
    # gpu_tensor[:] = pin_mem_tensor
    # Profile 100 host-to-device copies. The source is pageable memory, so
    # non_blocking=True cannot make the transfer truly asynchronous.
    for _ in range(100):
        cpu_tensor.cuda(non_blocking=True)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=16), flush=True)
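# A minimal timing sketch (not in the original script): compare pageable vs
# pinned host-to-device copies with CUDA events instead of the profiler.
# Only a pinned source can overlap with compute, so the pageable copy is
# expected to cost more per call.
def time_h2d_copy(src: torch.Tensor, iters: int = 100) -> float:
    """Return the average milliseconds per host-to-device copy of `src`."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        src.cuda(non_blocking=True)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

print("pageable:", time_h2d_copy(cpu_tensor), "ms")
print("pinned:  ", time_h2d_copy(pin_mem_tensor), "ms")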
# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 1 --diverse_mode | tee log.txt

# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 1 --dp 1 --diverse_mode | tee log.txt  (try this one)

# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 1 --output_constraint_mode xgrammar | tee log.txt
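# Quick smoke test for any of the servers above (a sketch, assuming lightllm's
# /generate endpoint; adjust the port to match the launch command used):
# curl http://127.0.0.1:8019/generate -X POST -H 'Content-Type: application/json' \
#      -d '{"inputs": "What is AI?", "parameters": {"max_new_tokens": 16}}'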
# Tensor.numpy() on a CPU tensor returns a zero-copy view, so this slice
# assignment writes directly into the pinned buffer.
pin_mem_tensor.numpy()[0:10] = list(range(10))
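# Sanity check (added sketch): the ndarray view and the tensor share storage,
# so the write above is visible through the tensor itself.
assert torch.equal(pin_mem_tensor[0:10], torch.arange(10, dtype=torch.int32))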
print("ok")
# CUDA_VISIBLE_DEVICES=4,5,6,7 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 8 --dp 8 | tee log.txt  (try this one)
# LOADWORKER=16 python -m lightllm.server.api_server --model_dir /mtc/DeepSeek-R1 --mtp_draft_model_dir /mtc/DeepSeek-R1-NextN/ --mtp_mode deepseekv3 --mtp_step 1 --enable_fa3 --graph_max_batch_size 64 --tp 8 --port 15001 | tee debug.txt
# MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8019 \
# --model_dir /mtc/DeepSeek-R1 \
# --tp 8 \
# --dp 8 \
# --enable_fa3 \
# --enable_prefill_microbatch_overlap \
# --enable_decode_microbatch_overlap \
# --mem_fraction 0.8 \
# --batch_max_tokens 4096

# MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server --port 8019 \
# --model_dir /mtc/DeepSeek-R1 \
# --tp 8 \
# --dp 8 \
# --enable_fa3 \
# --mem_fraction 0.8 \
# --batch_max_tokens 4096 \
# --mtp_draft_model_dir /mtc/DeepSeek-R1-NextN/ --mtp_mode deepseekv3 --mtp_step 1
# CUDA_VISIBLE_DEVICES=0,1 LOADWORKER=18 python -m lightllm.server.api_server --port 8019 \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --tp 4 \
# --enable_fa3 \
# --nnodes 2 \
# --node_rank 0 \
# --nccl_host 127.0.0.1 \
# --nccl_port 2732

# CUDA_VISIBLE_DEVICES=2,3 LOADWORKER=18 python -m lightllm.server.api_server --port 8021 \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --tp 4 \
# --enable_fa3 \
# --nnodes 2 \
# --node_rank 1 \
# --nccl_host 127.0.0.1 \
# --nccl_port 2732

# python -m lightllm.server.api_server --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --run_mode "pd_master" --host 127.0.0.1 --port 60011
# CUDA_VISIBLE_DEVICES=0,1 MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --run_mode "prefill" \
# --tp 2 \
# --dp 1 \
# --host 0.0.0.0 \
# --port 8019 \
# --nccl_port 2732 \
# --enable_fa3 \
# --disable_cudagraph \
# --pd_master_ip 127.0.0.1 \
# --pd_master_port 60011

# CUDA_VISIBLE_DEVICES=2,3 MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
# --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ \
# --run_mode "decode" \
# --tp 2 \
# --dp 1 \
# --host 0.0.0.0 \
# --port 8121 \
# --nccl_port 27321 \
# --enable_fa3 \
# --pd_master_ip 127.0.0.1 \
# --pd_master_port 60011
# CUDA_VISIBLE_DEVICES=0,1 LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 4 --nccl_port 27321 --node_rank 0 --nnodes 2 | tee log.txt

# CUDA_VISIBLE_DEVICES=2,3 LOADWORKER=16 python -m lightllm.server.api_server --port 8011 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 4 --dp 4 --nccl_port 27321 --node_rank 1 --nnodes 2

# LOADWORKER=16 python -m lightllm.server.api_server --port 8019 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 8 --dp 1 --enable_fa3
# lightllm v1.0.1-4209c8c4-deepep

# docker run -itd --gpus all --privileged=true --shm-size=128G -v /mtc:/mtc --name wzj 44feca8a0c86
# LOADWORKER=16 python -m lightllm.server.api_server --port 8011 --model_dir /mtc/niushengxiao/Qwen/Qwen2.5-14B-Instruct/ --tp 1 --dp 1 --nccl_port 27321 --enable_cpu_cache