feat: add DiT data structures.

yiming-l21 · xiao-yu-chen · commit ae7eff1af5ee · 2025-09-18T17:28:52.000+08:00
diff --git a/xllm/api_service/api_service_impl.h b/xllm/api_service/api_service_impl.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include <memory>
 
 #include "call.h"
+#include "core/runtime/dit_master.h"
 #include "core/runtime/llm_master.h"
-
 namespace xllm {
 
 template <typename T>
diff --git a/xllm/core/framework/batch/batch.h b/xllm/core/framework/batch/batch.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
+#include "framework/request/dit_request_params.h"
 #include "framework/request/mm_data.h"
 #include "framework/request/request.h"
 #include "framework/request/sequence.h"
diff --git a/xllm/core/framework/request/request.cpp b/xllm/core/framework/request/request.cpp
@@ -48,6 +48,23 @@ Request::Request(const std::string& request_id,
       slo_ms_(slo_ms) {
   create_sequences_group();
 }
+Request::Request(const std::string& request_id,
+                 const std::string& x_request_id,
+                 const std::string& x_request_time,
+                 const DITRequestState& state,
+                 const std::string& service_request_id,
+                 bool offline,
+                 int32_t slo_ms,
+                 RequestPriority priority)
+    : request_id_(request_id),
+      service_request_id_(service_request_id),
+      x_request_id_(x_request_id),
+      x_request_time_(x_request_time),
+      dit_state_(state),  // const ref: always copied, so no std::move here
+      created_time_(absl::Now()),
+      offline_(offline),
+      priority_(priority),
+      slo_ms_(slo_ms) {}  // DiT path: create_sequences_group() is not called
 
 void Request::create_sequences_group() {
   SequenceParams sequence_params;
diff --git a/xllm/core/framework/request/request.h b/xllm/core/framework/request/request.h
@@ -43,6 +43,14 @@ class Request {
           bool offline = false,
           int32_t slo_ms = 0,
           RequestPriority priority = RequestPriority::NORMAL);
+  Request(const std::string& request_id,
+          const std::string& x_request_id,
+          const std::string& x_request_time,
+          const DITRequestState& state,  // DiT overload: kept as dit_state_
+          const std::string& service_request_id = "",
+          bool offline = false,
+          int32_t slo_ms = 0,
+          RequestPriority priority = RequestPriority::NORMAL);
 
   bool finished() const;
 
@@ -93,7 +101,7 @@ class Request {
   const RequestPriority priority() const { return priority_; }
 
   RequestState& state() { return state_; }
-
+  DITRequestState& dit_state() { return dit_state_; }  // mutable DiT state
   void update_connection_status();
 
   bool check_beam_search() const {
@@ -115,7 +123,7 @@ class Request {
   std::string x_request_time_;
 
   RequestState state_;
-
+  DITRequestState dit_state_;  // DiT parameters; set by the DiT constructor
   // list of sequences to generate completions for the prompt
   // use deque instead of vector to avoid no-copy move for Sequence
   //  std::deque<Sequence> sequences;
diff --git a/xllm/core/framework/request/request_output.cpp b/xllm/core/framework/request/request_output.cpp
@@ -51,4 +51,34 @@ void RequestOutput::log_request_status() const {
   }
 }
 
+void ImageRequestOutput::log_request_status() const {
+  // Nothing to record until a status has been attached to this output.
+  if (!status) {
+    return;
+  }
+
+  // Bump the counter matching the status code; codes without a dedicated
+  // counter are tallied as unknown.
+  switch (status->code()) {
+    case StatusCode::OK:
+      COUNTER_INC(request_status_total_ok);
+      break;
+    case StatusCode::CANCELLED:
+      COUNTER_INC(request_status_total_cancelled);
+      break;
+    case StatusCode::INVALID_ARGUMENT:
+      COUNTER_INC(request_status_total_invalid_argument);
+      break;
+    case StatusCode::DEADLINE_EXCEEDED:
+      COUNTER_INC(request_status_total_deadline_exceeded);
+      break;
+    case StatusCode::RESOURCE_EXHAUSTED:
+      COUNTER_INC(request_status_total_resource_exhausted);
+      break;
+    case StatusCode::UNKNOWN:
+    default:
+      COUNTER_INC(request_status_total_unknown);
+      break;
+  }
+}
 }  // namespace xllm
diff --git a/xllm/core/framework/request/request_output.h b/xllm/core/framework/request/request_output.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include <vector>
 
 #include "core/common/types.h"
+#include "image_generation.pb.h"
 
 namespace xllm {
 struct Usage {
@@ -75,6 +76,22 @@ struct SequenceOutput {
   // the embeddings of the prompt token
   std::optional<std::vector<float>> embeddings;
 };
+// Result for one entry of an image request; scalar fields default to zero.
+struct ImageGenerationOutput {
+  // the index of the sequence in the request.
+  size_t index = 0;
+
+  // the generated image in proto tensor format.
+  proto::Tensor image_tensor;
+
+  // the height of the generated image.
+  int32_t height = 0;
+  // the width of the generated image.
+  int32_t width = 0;
+
+  // seed used for image generation.
+  int64_t seed = 0;
+};
 
 struct RequestOutput {
   RequestOutput() = default;
@@ -108,10 +125,44 @@ struct RequestOutput {
   bool cancelled = false;
 };
 
+struct ImageRequestOutput {
+  ImageRequestOutput() = default;
+  // wraps a status; every other field keeps its default value.
+  ImageRequestOutput(Status&& _status) : status(std::move(_status)) {}
+  // bumps the request-status counter matching `status`, if one is set.
+  void log_request_status() const;
+
+  // the id of the request.
+  std::string request_id;
+
+  // the id of the request, as generated in the xllm service.
+  std::string service_request_id;
+
+  // the status of the request; empty until one is assigned.
+  std::optional<Status> status;
+
+  // the output for each sequence in the request.
+  std::vector<ImageGenerationOutput> outputs;
+
+  // whether the request is finished.
+  bool finished = false;
+
+  // whether the request is cancelled.
+  bool cancelled = false;
+};
+
 // callback function for output, return true to continue, false to stop/cancel
 using OutputCallback = std::function<bool(RequestOutput output)>;
-
+// callback invoked with an image request output; return true to continue,
+// false to stop/cancel
+using ImageOutputCallback = std::function<bool(ImageRequestOutput output)>;
+// callback function for batch output, return true to continue, false to
+// stop/cancel
 using BatchOutputCallback =
     std::function<bool(size_t index, RequestOutput output)>;
+// callback function for batch image output (index + output); return true to
+// continue, false to stop/cancel
+using BatchImageOutputCallback =
+    std::function<bool(size_t index, ImageRequestOutput output)>;
 
 }  // namespace xllm
diff --git a/xllm/core/framework/request/request_params.cpp b/xllm/core/framework/request/request_params.cpp
@@ -39,6 +39,11 @@ std::string generate_chat_request_id() {
          short_uuid.random();
 }
 
+// Builds an id of the form "imggen-<instance name hash>-<random short uuid>".
+std::string generate_image_generation_request_id() {
+  const auto& instance_hash = InstanceName::name()->get_name_hash();
+  return "imggen-" + instance_hash + "-" + short_uuid.random();
+}
 }  // namespace
 
 RequestParams::RequestParams(const proto::CompletionRequest& request,
@@ -332,6 +337,76 @@ RequestParams::RequestParams(const proto::EmbeddingRequest& request,
   streaming = false;
 }
 
+ImageRequestParams::ImageRequestParams(
+    const proto::ImageGenerationRequest& request,
+    const std::string& x_rid,
+    const std::string& x_rtime)
+    : request_id(generate_image_generation_request_id()),
+      x_request_id(x_rid),
+      x_request_time(x_rtime),
+      model(request.model()) {
+  if (request.has_service_request_id()) {
+    service_request_id = request.service_request_id();
+  }
+  // Input section: the prompt is always copied; every other field is copied
+  // only when the request explicitly carries it.
+  const auto& in = request.input();
+  input_params.prompt = in.prompt();
+  if (in.has_prompt_2()) {
+    input_params.prompt_2 = in.prompt_2();
+  }
+  if (in.has_negative_prompt()) {
+    input_params.negative_prompt = in.negative_prompt();
+  }
+  if (in.has_negative_prompt_2()) {
+    input_params.negative_prompt_2 = in.negative_prompt_2();
+  }
+  if (in.has_prompt_embeds()) {
+    input_params.prompt_embeds = in.prompt_embeds();
+  }
+  if (in.has_pooled_prompt_embeds()) {
+    input_params.pooled_prompt_embeds = in.pooled_prompt_embeds();
+  }
+  if (in.has_negative_prompt_embeds()) {
+    input_params.negative_prompt_embeds = in.negative_prompt_embeds();
+  }
+  if (in.has_negative_pooled_prompt_embeds()) {
+    input_params.negative_pooled_prompt_embeds =
+        in.negative_pooled_prompt_embeds();
+  }
+  if (in.has_latents()) {
+    input_params.latents = in.latents();
+  }
+  // Generation section: num_images_per_prompt falls back to 1 when absent.
+  const auto& gen = request.parameters();
+  if (gen.has_size()) {
+    generation_params.size = gen.size();
+  }
+  if (gen.has_num_inference_steps()) {
+    generation_params.num_inference_steps = gen.num_inference_steps();
+  }
+  if (gen.has_true_cfg_scale()) {
+    generation_params.true_cfg_scale = gen.true_cfg_scale();
+  }
+  if (gen.has_guidance_scale()) {
+    generation_params.guidance_scale = gen.guidance_scale();
+  }
+  generation_params.num_images_per_prompt = 1;
+  if (gen.has_num_images_per_prompt()) {
+    generation_params.num_images_per_prompt =
+        static_cast<uint32_t>(gen.num_images_per_prompt());
+  }
+  if (gen.has_seed()) {
+    generation_params.seed = gen.seed();
+  }
+  if (gen.has_max_sequence_length()) {
+    generation_params.max_sequence_length = gen.max_sequence_length();
+  }
+}
+// Performs no validation: always returns true and never invokes the callback.
+bool ImageRequestParams::verify_params(ImageOutputCallback /*callback*/) const {
+  return true;
+}
 bool RequestParams::verify_params(OutputCallback callback) const {
   if (n == 0) {
     CALLBACK_WITH_ERROR(StatusCode::INVALID_ARGUMENT,
diff --git a/xllm/core/framework/request/request_params.h b/xllm/core/framework/request/request_params.h
@@ -27,7 +27,9 @@ limitations under the License.
 #include "completion.pb.h"
 #include "core/common/macros.h"
 #include "core/common/types.h"
+#include "dit_request_params.h"
 #include "embedding.pb.h"
+#include "image_generation.pb.h"
 #include "multimodal.pb.h"
 #include "request.h"
 #include "request_output.h"
@@ -139,4 +141,32 @@ struct RequestParams {
   nlohmann::json chat_template_kwargs = nlohmann::json::object();
 };
 
+struct ImageRequestParams {
+  ImageRequestParams() = default;
+  ImageRequestParams(const proto::ImageGenerationRequest& request,
+                     const std::string& x_rid,
+                     const std::string& x_rtime);
+
+  bool verify_params(ImageOutputCallback callback) const;
+
+  // request ids; request_id is generated by the proto constructor.
+  std::string request_id;
+  std::string service_request_id = "";  // copied from the request when set
+  std::string x_request_id;             // x_rid constructor argument
+  std::string x_request_time;           // x_rtime constructor argument
+
+  std::string model;
+
+  bool offline = false;
+
+  int32_t slo_ms = 0;
+
+  RequestPriority priority = RequestPriority::NORMAL;
+  // Input parameters copied from request.input() by the proto constructor.
+  InputParams input_params;
+  // Mandatory: Generation control parameters (encapsulates all fields related
+  // to "image generation process")
+  GenerationParams generation_params;
+};
+
 }  // namespace xllm
diff --git a/xllm/core/framework/request/request_state.h b/xllm/core/framework/request/request_state.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include <vector>
 
 #include "core/framework/sampling/sampling_params.h"
+#include "dit_request_params.h"
 #include "mm_data.h"
 #include "request_output.h"
 #include "stopping_checker.h"
@@ -150,4 +151,18 @@ struct RequestState final {
   std::optional<Call*> call_;
 };
 
+// Per-request DiT state: owns the input and generation parameters.
+struct DITRequestState {
+  DITRequestState() = default;
+  DITRequestState(InputParams&& input_params,
+                  GenerationParams&& generation_params)
+      : input_params_(std::move(input_params)),
+        generation_params_(std::move(generation_params)) {}
+  InputParams& input_params() { return input_params_; }
+  GenerationParams& generation_params() { return generation_params_; }
+
+ private:
+  InputParams input_params_;
+  GenerationParams generation_params_;
+};
 }  // namespace xllm
diff --git a/xllm/core/runtime/forward_params.h b/xllm/core/runtime/forward_params.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "common/types.h"
 #include "framework/model/model_input_params.h"
 #include "framework/sampling/sampling_params.h"
-
+#include "tensor.pb.h"
 namespace xllm {
 
 class WorkerType {
@@ -33,6 +33,7 @@ class WorkerType {
     INVALID = 0,
     LLM,   // LLM
     VLM,   // VLM
+    DIT,   // DIT
     ELM,   // Embedding LM
     EVLM,  // Embedding VLM
   };
@@ -43,6 +44,8 @@ class WorkerType {
       value_ = LLM;
     } else if (str == "VLM") {
       value_ = VLM;
+    } else if (str == "DIT") {
+      value_ = DIT;
     } else if (str == "ELM") {
       value_ = ELM;
     } else if (str == "EVLM") {
@@ -67,6 +70,8 @@ class WorkerType {
       return "LLM";
     } else if (this->value_ == VLM) {
       return "VLM";
+    } else if (this->value_ == DIT) {
+      return "DIT";
     } else if (this->value_ == ELM) {
       return "ELM";
     } else if (this->value_ == EVLM) {
@@ -118,6 +123,9 @@ struct ForwardOutput {
   torch::Tensor expert_load_data;
 
   int32_t prepared_layer_id;
+
+  // dit related output
+  torch::Tensor image;
 };
 
 // Model input with raw data, which will be
diff --git a/xllm/core/runtime/worker.cpp b/xllm/core/runtime/worker.cpp