@@ -57,28 +57,33 @@ bool JSONFFIEngine::AddRequest(std::string request_json_str, std::string request
     return false;
   }
   ChatCompletionRequest request = request_res.Unwrap();
-  // get prompt: note, assistant was appended in the end.
-  Result<std::vector<Data>> inputs_obj =
-      CreatePrompt(this->conv_template_, request, this->model_config_, this->device_);
-  if (inputs_obj.IsErr()) {
-    err_ = inputs_obj.UnwrapErr();
-    return false;
-  }
-  Array<Data> inputs = inputs_obj.Unwrap();
-
-  // generation_cfg
+  Array<Data> inputs;
   Array<String> stop_strs;
-  stop_strs.reserve(this->conv_template_.stop_str.size());
-  for (const std::string& stop_str : this->conv_template_.stop_str) {
-    stop_strs.push_back(stop_str);
-  }
-  if (request.stop.has_value()) {
-    stop_strs.reserve(stop_strs.size() + request.stop.value().size());
-    for (const std::string& stop_str : request.stop.value()) {
+  bool is_special_request =
+      (request.debug_config.has_value() &&
+       request.debug_config.value().special_request != SpecialRequestKind::kNone);
+  // special request does not have to go through prompt construction
+  if (!is_special_request) {
+    // get prompt: note, assistant was appended in the end.
+    Result<std::vector<Data>> inputs_obj =
+        CreatePrompt(this->conv_template_, request, this->model_config_, this->device_);
+    if (inputs_obj.IsErr()) {
+      err_ = inputs_obj.UnwrapErr();
+      return false;
+    }
+    inputs = inputs_obj.Unwrap();
+
+    stop_strs.reserve(this->conv_template_.stop_str.size());
+    for (const std::string& stop_str : this->conv_template_.stop_str) {
       stop_strs.push_back(stop_str);
     }
+    if (request.stop.has_value()) {
+      stop_strs.reserve(stop_strs.size() + request.stop.value().size());
+      for (const std::string& stop_str : request.stop.value()) {
+        stop_strs.push_back(stop_str);
+      }
+    }
   }
-
   // create a generation config from request
   const auto& default_gen_cfg = default_generation_config_;
   auto gen_cfg = tvm::runtime::make_object<GenerationConfigNode>();
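
In the new flow a special request skips prompt construction entirely: inputs stays empty and no stop strings are collected, so the request proceeds straight to the generation-config handling below. A minimal sketch of what such a request could look like on the wire; the JSON field names and the "query_engine_metrics" kind are assumptions beyond what this diff shows:

// Hypothetical request JSON handed to JSONFFIEngine::AddRequest (shape
// assumed); any special_request kind other than SpecialRequestKind::kNone
// takes the bypass branch above.
std::string request_json = R"({
  "messages": [],
  "model": "json_ffi",
  "debug_config": {"special_request": "query_engine_metrics"}
})";
engine->AddRequest(request_json, /*request_id=*/"special-0");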
@@ -115,8 +120,6 @@ bool JSONFFIEngine::Abort(std::string request_id) {

 std::string JSONFFIEngine::GetLastError() { return err_; }

-std::string JSONFFIEngine::JSONMetrics() { return this->engine_->JSONMetrics(); }
-
 void JSONFFIEngine::ExitBackgroundLoop() { this->engine_->ExitBackgroundLoop(); }

 JSONFFIEngine::~JSONFFIEngine() { this->ExitBackgroundLoop(); }
@@ -131,7 +134,6 @@ class JSONFFIEngineImpl : public JSONFFIEngine, public ModuleNode {
   TVM_MODULE_VTABLE_ENTRY("chat_completion", &JSONFFIEngineImpl::ChatCompletion);
   TVM_MODULE_VTABLE_ENTRY("abort", &JSONFFIEngineImpl::Abort);
   TVM_MODULE_VTABLE_ENTRY("get_last_error", &JSONFFIEngineImpl::GetLastError);
-  TVM_MODULE_VTABLE_ENTRY("json_metrics", &JSONFFIEngineImpl::JSONMetrics);
   TVM_MODULE_VTABLE_ENTRY("run_background_loop", &JSONFFIEngineImpl::RunBackgroundLoop);
   TVM_MODULE_VTABLE_ENTRY("run_background_stream_back_loop",
                           &JSONFFIEngineImpl::RunBackgroundStreamBackLoop);
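
With both the JSONMetrics method and its vtable entry gone, the module no longer exposes a synchronous metrics accessor; judging by the streaming changes below, metrics presumably reach callers asynchronously as a final usage message instead. A sketch of the observable effect on the module interface, with an assumed factory name for illustration:

// After this change, looking up the removed entry yields an undefined
// PackedFunc ("CreateJSONFFIEngine" is a hypothetical factory name).
tvm::runtime::Module engine_mod = CreateJSONFFIEngine();
tvm::runtime::PackedFunc f = engine_mod.GetFunction("json_metrics");
ICHECK(f == nullptr);  // entry no longer registered in the vtable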
@@ -190,11 +192,35 @@ class JSONFFIEngineImpl : public JSONFFIEngine, public ModuleNode {

   String GetResponseFromStreamOutput(Array<RequestStreamOutput> delta_outputs) {
     std::unordered_map<std::string, std::vector<ChatCompletionStreamResponseChoice>> response_map;
+    std::vector<picojson::value> request_final_usage_messages;
+    std::string model = "json_ffi";
+
     for (const auto& delta_output : delta_outputs) {
       std::string request_id = delta_output->request_id;
       if (response_map.find(request_id) == response_map.end()) {
         response_map[request_id] = std::vector<ChatCompletionStreamResponseChoice>();
       }
+
+      // build the final usage messages
+      // invariant: we can always let other messages come first and
+      // the final usage messages last, as final usage is always last
+      if (delta_output->request_final_usage_json_str.defined()) {
+        ChatCompletionStreamResponse response;
+        response.id = request_id;
+        response.model = model;
+        response.system_fingerprint = "";
+        std::string usage_json_str = delta_output->request_final_usage_json_str.value();
+        picojson::value usage_json;
+        std::string err = picojson::parse(usage_json, usage_json_str);
+        if (!err.empty()) {
+          err_ = err;
+        } else {
+          response.usage = usage_json;
+        }
+        request_final_usage_messages.push_back(picojson::value(response.AsJSON()));
+        continue;
+      }
+      ICHECK_NE(delta_output->group_finish_reason.size(), 0);
       ChatCompletionStreamResponseChoice choice;

       if (delta_output->group_finish_reason.size() != 1) {
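
Usage-only outputs are parsed and buffered rather than emitted inline, and the ICHECK guarantees every remaining output carries at least one finish-reason group. A sketch of how a receiver could tell the final-usage chunk apart from delta chunks, assuming delta responses omit the usage field (names here are illustrative):

// response_str: the serialized array produced by the method below
picojson::value v;
std::string err = picojson::parse(v, response_str);
if (err.empty()) {
  for (const picojson::value& item : v.get<picojson::array>()) {
    const picojson::object& obj = item.get<picojson::object>();
    if (obj.count("usage")) {
      // final usage message: no deltas to consume, the request is complete
    }
  }
}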
@@ -232,13 +258,17 @@ class JSONFFIEngineImpl : public JSONFFIEngine, public ModuleNode {

     picojson::array response_arr;
     for (const auto& [request_id, choices] : response_map) {
+      if (choices.size() == 0) continue;
       ChatCompletionStreamResponse response;
       response.id = request_id;
       response.choices = choices;
       response.model = "json_ffi";  // TODO: Return model name from engine (or from args)
       response.system_fingerprint = "";
       response_arr.push_back(picojson::value(response.AsJSON()));
     }
+    for (auto&& item : request_final_usage_messages) {
+      response_arr.emplace_back(std::move(item));
+    }
     return picojson::value(response_arr).serialize();
   }
 };
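
Two details keep the stream well formed: a request whose only output this round is the final usage message would otherwise yield a chunk with an empty choices list, so such entries are skipped; and the buffered usage messages are appended after every content chunk, preserving the invariant that final usage arrives last. An illustrative serialized payload for a usage-only round (usage fields assumed to follow the OpenAI-style shape; values made up):

// [{"id": "req-0", "model": "json_ffi", "system_fingerprint": "",
//   "usage": {"prompt_tokens": 5, "completion_tokens": 42}}]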