
Commit 0f433c0

clean code

1 parent ea8f30b

17 files changed: +29 -44 lines


lightllm/common/basemodel/layer_weights/hf_load_utils.py

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ def load_hf_weights(data_type, weight_dir, pre_post_layer=None, transformer_laye
         transformer_layer_list=transformer_layer_list,
         weight_dir=weight_dir,
     )  # noqa
-    worker = int(os.environ.get("LOADWORKER", 16))
+    worker = int(os.environ.get("LOADWORKER", 1))
     with Pool(worker) as p:
         iterator = p.imap_unordered(partial_func, candidate_files, chunksize=1)
         desc_str = f"pid {os.getpid()} Loading model weights with {worker} workers"
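
Note on the change above: weight shards are loaded through a multiprocessing.Pool whose size comes from the LOADWORKER environment variable, and this commit drops the default from 16 workers to 1, so parallel loading becomes opt-in. A minimal sketch of the pattern, where load_one_file, the file names, and the weight path are stand-ins rather than lightllm's actual helpers:

import os
from functools import partial
from multiprocessing import Pool

def load_one_file(path, weight_dir=None):
    # Stand-in: parse one weight shard found under weight_dir.
    return path

candidate_files = ["model-00001.safetensors", "model-00002.safetensors"]
partial_func = partial(load_one_file, weight_dir="/path/to/weights")

# Post-commit default is a single worker; parallelism is opt-in,
# e.g. LOADWORKER=16 on the launching command line.
worker = int(os.environ.get("LOADWORKER", 1))
with Pool(worker) as p:
    for _ in p.imap_unordered(partial_func, candidate_files, chunksize=1):
        pass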

lightllm/common/req_manager.py

Lines changed: 0 additions & 1 deletion

@@ -7,7 +7,6 @@
 from lightllm.common.basemodel.triton_kernel.gen_sampling_params import update_req_to_token_id_counter
 from lightllm.utils.envs_utils import enable_env_vars, get_env_start_args
 from lightllm.utils.config_utils import get_vocab_size
-from lightllm.server.router.dynamic_prompt.hybrid_radix_cache import HybridMemManager

 logger = init_logger(__name__)

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_local_cumsum_scalar/{B=1,BT=64,H=8,IS_VARLEN=true,REVERSE=false}_NVIDIA_H200.json

Lines changed: 3 additions & 0 deletions

@@ -14,6 +14,9 @@
     "16": {
         "num_warps": 4
     },
+    "164096": {
+        "num_warps": 1
+    },
     "2048": {
         "num_warps": 2
     },
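
The JSON files in this and the next four diffs appear to be cached autotune results: per-GPU, per-kernel tables mapping an input-size key (e.g. "16", "2048", or the newly added "164096") to Triton launch parameters such as num_warps. A hypothetical sketch of consuming such a table, assuming exact keys are preferred with nearest-bucket fallback; the real lookup lives in lightllm's Autotuner and may differ, and the filename here is a placeholder for the long signature-keyed paths above:

import json

def pick_config(table: dict, size: int) -> dict:
    # Assumed policy: exact key if present, else the nearest size bucket.
    if str(size) in table:
        return table[str(size)]
    nearest = min((int(k) for k in table), key=lambda k: abs(k - size))
    return table[str(nearest)]

with open("chunk_local_cumsum_scalar.json") as f:  # placeholder filename
    table = json.load(f)

print(pick_config(table, 2048))    # -> {"num_warps": 2}
print(pick_config(table, 164096))  # -> {"num_warps": 1}, the bucket this commit adds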

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/fused_gdn_gating:v1/{NUM_HEADS=8,a_dtype=torch.bfloat16}_NVIDIA_H200.json

Lines changed: 4 additions & 0 deletions

@@ -19,6 +19,10 @@
         "BLK_HEADS": 8,
         "num_warps": 2
     },
+    "164096": {
+        "BLK_HEADS": 8,
+        "num_warps": 1
+    },
     "2048": {
         "BLK_HEADS": 16,
         "num_warps": 1

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gated_rmsnorm_forward:v1/{N=128,has_bias=false,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json

Lines changed: 4 additions & 0 deletions

@@ -7,6 +7,10 @@
         "BLOCK_N": 256,
         "num_warps": 1
     },
+    "1312768": {
+        "BLOCK_N": 64,
+        "num_warps": 2
+    },
     "16384": {
         "BLOCK_N": 128,
         "num_warps": 1

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_align_fused:v1/{topk_num=10}_NVIDIA_H200.json

Lines changed: 4 additions & 0 deletions

@@ -31,6 +31,10 @@
         "BLOCK_SIZE": 128,
         "num_warps": 4
     },
+    "32768": {
+        "BLOCK_SIZE": 256,
+        "num_warps": 8
+    },
     "4096": {
         "BLOCK_SIZE": 128,
         "num_warps": 8

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/silu_and_mul_fwd:v1/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json

Lines changed: 6 additions & 0 deletions

@@ -59,6 +59,12 @@
         "NUM_STAGES": 2,
         "num_warps": 4
     },
+    "164096": {
+        "BLOCK_M": 64,
+        "BLOCK_N": 128,
+        "NUM_STAGES": 4,
+        "num_warps": 1
+    },
     "2048": {
         "BLOCK_M": 1,
         "BLOCK_N": 256,

lightllm/common/triton_utils/autotuner.py

Lines changed: 1 addition & 1 deletion

@@ -62,7 +62,7 @@ def autotune(
        as needed before invocation.
    """

-    def decorator(fn):
+    def decorator(fn: Callable) -> Callable:
        return Autotuner(
            fn=fn,
            kernel_name=kernel_name,
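
The only change here is annotating the inner decorator with Callable, the idiomatic type for a decorator that wraps a function in a callable object. A reduced sketch of the pattern, with Autotuner cut down to a stand-in wrapper (the real class takes more arguments, as the kernel_name=... line above suggests):

from typing import Callable

class Autotuner:
    # Stand-in: the real Autotuner also carries tuning-space and
    # cached-config state, and swaps in tuned launch parameters.
    def __init__(self, fn: Callable, kernel_name: str):
        self.fn = fn
        self.kernel_name = kernel_name

    def __call__(self, *args, **kwargs):
        return self.fn(*args, **kwargs)

def autotune(kernel_name: str) -> Callable:
    def decorator(fn: Callable) -> Callable:
        return Autotuner(fn=fn, kernel_name=kernel_name)
    return decorator

@autotune(kernel_name="silu_and_mul_fwd:v1")
def silu_and_mul_fwd(x):
    ...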

lightllm/models/qwen3next/layer_infer/transformer_layer_infer.py

Lines changed: 2 additions & 2 deletions

@@ -271,14 +271,14 @@ def _linear_attn(
             query_start_loc=infer_state.b1_cu_q_seq_len,
             cache_indices=buffer_idx,
             has_initial_state=infer_state.b_ready_cache_len > 0,
-            conv_states=conv_states.transpose(1, 2),
+            conv_states=conv_states,
             activation=self.activation,
         )
         mixed_qkv = out_tensor.transpose(0, 1)
     else:
         mixed_qkv = causal_conv1d_update(
             mixed_qkv,
-            conv_states.transpose(1, 2),
+            conv_states,
             layer_weight.linear_conv1d.mm_param.weight.transpose(0, 1),
             bias=layer_weight.linear_conv1d.mm_param.bias,
             activation=self.activation,

lightllm/models/qwen3next/model.py

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ def _init_mem_manager(self):
            mtp_layer_num=start_args.mtp_step,
            full_attention_interval=self.config["full_attention_interval"],
            conv_state_dtype=self.data_type,
-           conv_state_shape=(conv_kernel_size - 1 + mtp_step, conv_dim // self.tp_world_size_),
+           conv_state_shape=(conv_dim // self.tp_world_size_, conv_kernel_size - 1 + mtp_step),
            ssm_state_dtype=ssm_dtype_dict[start_args.mamba_ssm_data_type],
            ssm_state_shape=(
                # mtp_step + 1,
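
Taken together with the transformer_layer_infer.py diff above, this swap stores each request's conv state as (channels, width) instead of (width, channels), which is why the two conv_states.transpose(1, 2) calls could be dropped: the cache presumably now sits in the layout the causal-conv1d kernels consume, contiguous rather than a transposed view. A sketch with made-up sizes:

import torch

conv_kernel_size, mtp_step = 4, 0
conv_dim, tp_world_size = 4096, 2
num_bufs = 8  # per-request cache slots; name is made up

# old layout: (bufs, width, channels) -- needed conv_states.transpose(1, 2)
old = torch.zeros(num_bufs, conv_kernel_size - 1 + mtp_step, conv_dim // tp_world_size)
# new layout: (bufs, channels, width) -- passed to causal_conv1d as-is
new = torch.zeros(num_bufs, conv_dim // tp_world_size, conv_kernel_size - 1 + mtp_step)

assert old.transpose(1, 2).shape == new.shape
# Unlike the transposed view of `old`, `new` is contiguous in this layout.
assert new.is_contiguous() and not old.transpose(1, 2).is_contiguous()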
