1 parent a5fa4d8 commit 1e1fba9
xllm/core/kernels/mlu/attention.cpp
@@ -32,7 +32,7 @@ void batch_prefill(const torch::Tensor& query,
                    const torch::Tensor& key,
                    const torch::Tensor& value,
                    torch::Tensor& output,
-                   torch::Tensor& output_lse,
+                   std::optional<torch::Tensor>& output_lse,
                    const std::optional<torch::Tensor>& query_start_loc,
                    const std::optional<torch::Tensor>& seq_start_loc,
                    const std::optional<torch::Tensor>& alibi_slope,
@@ -80,7 +80,7 @@ void batch_decode(const torch::Tensor& query,
                   const torch::Tensor& block_table,
                   const torch::Tensor& seq_lens,
                   const torch::Tensor& v_cache,
-                  torch::Tensor& output_lse,
+                  std::optional<torch::Tensor>& output_lse,
                   const std::optional<torch::Tensor>& q_quant_scale,
                   const std::optional<torch::Tensor>& k_cache_quant_scale,
                   const std::optional<torch::Tensor>& v_cache_quant_scale,
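
The change in both signatures is the `output_lse` parameter: a mandatory `torch::Tensor&` becomes `std::optional<torch::Tensor>&`, so the log-sum-exp output can be omitted when the caller does not request it. A minimal sketch of this out-parameter pattern, with a hypothetical `attend` function standing in for the real MLU kernels:

#include <optional>
#include <torch/torch.h>

// Hypothetical stand-in for batch_prefill/batch_decode: the LSE tensor is
// allocated and written only when the caller asked for it; an empty
// optional simply stays empty.
void attend(const torch::Tensor& query,
            torch::Tensor& output,
            std::optional<torch::Tensor>& output_lse,
            bool return_lse) {
  output = torch::softmax(query, /*dim=*/-1);  // placeholder for attention math
  if (return_lse) {
    output_lse = torch::logsumexp(query, /*dim=*/{-1});
  }
}

// A caller that does not want the LSE no longer has to supply a dummy tensor:
//   std::optional<torch::Tensor> lse;            // stays std::nullopt
//   attend(q, out, lse, /*return_lse=*/false);
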
xllm/core/kernels/mlu/mlu_ops_api.h
@@ -62,7 +62,7 @@ void batch_prefill(const torch::Tensor& query,
                    const torch::Tensor& key,
                    const torch::Tensor& value,
                    torch::Tensor& output,
-                   torch::Tensor& output_lse,
+                   std::optional<torch::Tensor>& output_lse,
                    const std::optional<torch::Tensor>& query_start_loc,
                    const std::optional<torch::Tensor>& seq_start_loc,
                    const std::optional<torch::Tensor>& alibi_slope,
@@ -87,7 +87,7 @@ void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  const torch::Tensor& v_cache,
-                 torch::Tensor& output_lse,
+                 std::optional<torch::Tensor>& output_lse,
                  const std::optional<torch::Tensor>& q_quant_scale,
                  const std::optional<torch::Tensor>& k_cache_quant_scale,
                  const std::optional<torch::Tensor>& v_cache_quant_scale,
xllm/core/kernels/ops_api.cpp
@@ -71,12 +71,11 @@ void reshape_paged_cache(ReshapePagedCacheParams& params) {
 
 void batch_prefill(AttentionParams& params) {
 #if defined(USE_MLU)
-  torch::Tensor lse = params.output_lse.value_or(torch::Tensor());
   mlu::batch_prefill(params.query,
                      params.key,
                      params.value,
                      params.output,
-                     lse,
+                     params.output_lse,
                      params.query_start_loc,
                      params.seq_start_loc,
                      params.alibi_slope,
@@ -94,7 +93,6 @@ void batch_prefill(AttentionParams& params) {
                      params.window_size_right,
                      params.compute_dtype,
                      params.return_lse);
-  params.output_lse = lse;
 #elif defined(USE_CUDA)
   throw std::runtime_error("batch_prefill for cuda not implemented");
 #else
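
With the kernel accepting the optional directly, the wrapper no longer needs the unwrap-and-write-back dance that the removed lines performed. A sketch of the two patterns side by side (`kernel_old`, `kernel_new`, and `wrapper` are illustrative names, not xllm functions):

#include <optional>
#include <torch/torch.h>

void kernel_old(torch::Tensor& lse) { lse = torch::zeros({1}); }
void kernel_new(std::optional<torch::Tensor>& lse) { /* fills lse only if requested */ }

void wrapper(std::optional<torch::Tensor>& output_lse) {
  // Old pattern (removed by this commit): default-construct a dummy tensor
  // when the optional is empty, call, then copy the result back -- the
  // optional ends up engaged even when no LSE was wanted.
  torch::Tensor lse = output_lse.value_or(torch::Tensor());
  kernel_old(lse);
  output_lse = lse;

  // New pattern: hand the optional straight to the kernel; it decides
  // whether to fill it, and an empty optional can stay empty.
  kernel_new(output_lse);
}
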
@@ -104,14 +102,13 @@ void batch_prefill(AttentionParams& params) {
 
 void batch_decode(AttentionParams& params) {
 #if defined(USE_MLU)
-  torch::Tensor lse = params.output_lse.value_or(torch::Tensor());
   mlu::batch_decode(params.query,
                     params.k_cache,
                     params.output,
                     params.block_table,
                     params.seq_lens,
                     params.v_cache,
-                    lse,
+                    params.output_lse,
                     params.q_quant_scale,
                     params.k_cache_quant_scale,
                     params.v_cache_quant_scale,
@@ -125,7 +122,6 @@ void batch_decode(AttentionParams& params) {
                     params.scale,
                     params.return_lse,
                     params.kv_cache_quant_bit_size);
-  params.output_lse = lse;
 #elif defined(USE_CUDA)
   throw std::runtime_error("batch_decode for cuda not implemented");
 #else
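
On the caller side, `AttentionParams::output_lse` now travels as a `std::optional<torch::Tensor>` end to end. A hedged usage sketch, assuming only the fields visible in this diff (everything else about `AttentionParams`, including default construction, is an assumption here):

#include <optional>
// assumes xllm's ops_api header, which declares AttentionParams and batch_prefill

void example() {
  AttentionParams params;
  // ... fill query/key/value/output and the other fields as usual ...
  params.return_lse = true;          // ask the kernel for the log-sum-exp
  params.output_lse = std::nullopt;  // kernel may allocate and fill this
  batch_prefill(params);
  if (params.output_lse.has_value()) {
    const torch::Tensor& lse = *params.output_lse;  // consume the result
    (void)lse;
  }
}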