Commit a096c91
[Serving] Support tensor parallel shards override in command line (#2533)
This PR adds support for command-line overrides for model JIT compilation. This is especially helpful for enabling tensor parallelism out of the box, so people do not need to manually tweak `mlc-chat-config.json` to use tensor parallelism.
1 parent 5f71aa9 commit a096c91
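
As an illustration, the command below is taken from the docs/deploy/cli.rst update in this commit; the shard count of 2 is only an example and should match the number of available GPUs:

  # Run the model across 2 GPUs without editing mlc-chat-config.json;
  # the model library is JIT-compiled with the overridden setting.
  mlc_llm chat HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC --overrides "tensor_parallel_shards=2"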

File tree

22 files changed: +347 −135 lines changed
cpp/serve/engine.cc

Lines changed: 47 additions & 20 deletions
@@ -7,6 +7,7 @@
 #include <dlpack/dlpack.h>
 #include <tvm/runtime/logging.h>
+#include <tvm/runtime/memory/memory_manager.h>
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
@@ -21,6 +22,7 @@
 #include "../grammar/grammar_state_matcher.h"
 #include "../support/json_parser.h"
 #include "../support/result.h"
+#include "../support/utils.h"
 #include "../tokenizers/tokenizers.h"
 #include "engine_actions/action.h"
 #include "engine_actions/action_commons.h"
@@ -278,22 +280,27 @@ class EngineImpl : public Engine {
     std::vector<std::pair<std::string, std::string>> models_and_model_libs =
         models_and_model_libs_res.Unwrap();
 
-    ICHECK_GE(models_and_model_libs.size(), 1);
+    int num_model = models_and_model_libs.size();
+    ICHECK_GE(num_model, 1);
     // - Initialize singleton states inside the engine.
     n->estate_->Reset();
     n->request_stream_callback_ = std::move(request_stream_callback);
     n->trace_recorder_ = trace_recorder;
     n->device_ = device;
     // - Load model config, create a shared disco session when tensor
     // parallelism is enabled.
+    std::vector<std::string> model_libs;
     std::vector<picojson::object> model_configs;
-    for (int i = 0; i < static_cast<int>(models_and_model_libs.size()); ++i) {
+    model_libs.reserve(num_model);
+    model_configs.reserve(num_model);
+    for (int i = 0; i < num_model; ++i) {
       const auto& [model_str, model_lib] = models_and_model_libs[i];
       Result<picojson::object> model_config_res = Model::LoadModelConfig(model_str);
       if (model_config_res.IsErr()) {
         return TResult::Error("Model " + std::to_string(i) +
                               " has invalid mlc-chat-config.json: " + model_config_res.UnwrapErr());
       }
+      model_libs.push_back(model_lib);
       model_configs.push_back(model_config_res.Unwrap());
     }
 
@@ -303,13 +310,14 @@ class EngineImpl : public Engine {
                                      model_configs[0]);
     }
 
-    Optional<Session> session = n->CreateDiscoSession(model_configs, device);
+    auto [session, num_shards] = n->CreateDiscoSession(model_libs, model_configs, device);
     // - Initialize each model independently.
     n->models_.clear();
-    for (int i = 0; i < static_cast<int>(models_and_model_libs.size()); ++i) {
+    for (int i = 0; i < num_model; ++i) {
       const auto& [model_str, model_lib] = models_and_model_libs[i];
-      Model model = Model::Create(model_lib, model_str, model_configs[i], device, session,
-                                  /*trace_enabled=*/trace_recorder.defined());
+      Model model =
+          Model::Create(model_lib, model_str, model_configs[i], device, session, num_shards,
+                        /*trace_enabled=*/trace_recorder.defined());
       n->models_.push_back(model);
     }
     // - Automatically infer the missing fields in EngineConfig JSON strings
@@ -622,25 +630,44 @@ class EngineImpl : public Engine {
   }
 
   /************** Utility Functions **************/
-  Optional<Session> CreateDiscoSession(const std::vector<picojson::object>& model_configs,
-                                       Device device) {
+  std::pair<Optional<Session>, int> CreateDiscoSession(
+      const std::vector<std::string>& model_libs,
+      const std::vector<picojson::object>& model_configs, Device device) {
     const auto& base_model_config = model_configs[0];
 
-    auto f_get_num_shards = [](const picojson::object& model_config) -> int {
-      constexpr auto kNumShardsKey = "tensor_parallel_shards";
-      if (model_config.count(kNumShardsKey)) {
-        const auto& val = model_config.at(kNumShardsKey);
-        CHECK(val.is<int64_t>());
-        return static_cast<int>(val.get<int64_t>());
+    auto f_get_num_shards = [&device](const std::string& model_lib,
+                                      const picojson::object& model_config) -> int {
+      if (!StartsWith(model_lib, "system://")) {
+        Module executable = tvm::runtime::Module::LoadFromFile(model_lib);
+        PackedFunc fload_exec = executable->GetFunction("vm_load_executable");
+        ICHECK(fload_exec.defined()) << "TVM runtime cannot find vm_load_executable";
+        Module local_vm = fload_exec();
+        local_vm->GetFunction("vm_initialization")(
+            static_cast<int>(device.device_type), device.device_id,
+            static_cast<int>(tvm::runtime::memory::AllocatorType::kPooled),
+            static_cast<int>(kDLCPU), 0,
+            static_cast<int>(tvm::runtime::memory::AllocatorType::kPooled));
+        return ModelMetadata::FromModule(local_vm, std::move(model_config)).tensor_parallel_shards;
       } else {
-        LOG(FATAL) << "Key \"tensor_parallel_shards\" not found.";
+        return 1;
       }
-      throw;
     };
 
-    int num_shards = std::transform_reduce(
-        model_configs.begin(), model_configs.end(), 1, [](int a, int b) { return std::max(a, b); },
-        f_get_num_shards);
+    int num_shards = -1;
+    ICHECK_EQ(model_libs.size(), model_configs.size());
+    for (int i = 0; i < static_cast<int>(model_libs.size()); ++i) {
+      int model_num_shards = f_get_num_shards(model_libs[i], model_configs[i]);
+      if (i == 0) {
+        num_shards = model_num_shards;
+      } else {
+        CHECK_EQ(model_num_shards, num_shards)
+            << "Inconsistent tensor_parallel_shards values across models. Some model is compiled "
+               "with tensor_parallel_shards "
+            << num_shards << " and some other model is compiled with tensor_parallel_shards "
+            << model_num_shards;
+      }
+    }
+
     Optional<Session> session = NullOpt;
     if (num_shards > 1) {
       constexpr const char* f_create_process_pool = "runtime.disco.create_process_pool";
@@ -664,7 +691,7 @@ class EngineImpl : public Engine {
       session = Session::ProcessSession(num_shards, f_create_process_pool, "mlc_llm.cli.worker");
       session.value()->InitCCL(ccl, ShapeTuple(device_ids));
     }
-    return session;
+    return {session, num_shards};
   }
 
   /************** Debug/Profile **************/

cpp/serve/function_table.cc

Lines changed: 4 additions & 10 deletions
@@ -70,22 +70,14 @@ PackedFunc FunctionTable::SessionFuncAsPackedFunc(Session sess, DRef sess_func,
 }
 
 void FunctionTable::Init(String reload_lib_path, Device device, picojson::object model_config,
-                         Optional<Session> session) {
+                         Optional<Session> session, int num_shards) {
   local_gpu_device = device;
   Device null_device{DLDeviceType(0), 0};
-  int num_shards;
-  {
-    if (model_config.count("tensor_parallel_shards")) {
-      CHECK(model_config["tensor_parallel_shards"].is<int64_t>());
-      num_shards = model_config["tensor_parallel_shards"].get<int64_t>();
-    } else {
-      num_shards = 1;
-    }
-  }
   this->model_config = model_config;
   this->cached_buffers = Map<String, ObjectRef>();
 
   if (num_shards > 1) {
+    ICHECK(session.defined());
     this->sess = session.value();
     this->use_disco = true;
     this->disco_mod = sess->CallPacked(sess->GetGlobalFunc("runtime.disco.load_vm_module"),
@@ -111,6 +103,7 @@ void FunctionTable::Init(String reload_lib_path, Device device, picojson::object
         ModelMetadata::FromModule(this->disco_mod->DebugGetFromRemote(0), std::move(model_config));
     this->_InitFunctions();
   } else {
+    ICHECK(!session.defined());
     Module executable{nullptr};
     PackedFunc fload_exec{nullptr};
     if (StartsWith(reload_lib_path, "system://")) {
@@ -145,6 +138,7 @@ void FunctionTable::Init(String reload_lib_path, Device device, picojson::object
     this->model_metadata_ = ModelMetadata::FromModule(this->local_vm, std::move(model_config));
     this->_InitFunctions();
   }
+  ICHECK_EQ(this->model_metadata_.tensor_parallel_shards, num_shards);
 }
 
 ObjectRef FunctionTable::LoadParams(const std::string& model_path, Device device) {

cpp/serve/function_table.h

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ struct FunctionTable {
   static PackedFunc SessionFuncAsPackedFunc(Session sess, DRef sess_func, String name);
 
   void Init(String reload_lib_path, Device device, picojson::object model_config,
-            Optional<Session> session);
+            Optional<Session> session, int num_shards);
 
   ObjectRef LoadParams(const std::string& model_path, Device device);

cpp/serve/model.cc

Lines changed: 6 additions & 4 deletions
@@ -27,9 +27,10 @@ class ModelImpl;
 TVM_REGISTER_OBJECT_TYPE(ModelObj);
 
 Model Model::Create(String reload_lib_path, String model_path, const picojson::object& model_config,
-                    DLDevice device, const Optional<Session>& session, bool trace_enabled) {
+                    DLDevice device, const Optional<Session>& session, int num_shards,
+                    bool trace_enabled) {
   return Model(make_object<ModelImpl>(reload_lib_path, model_path, model_config, device, session,
-                                      trace_enabled));
+                                      num_shards, trace_enabled));
 }
 
 Result<picojson::object> Model::LoadModelConfig(const String& model_path) {
@@ -56,14 +57,15 @@ class ModelImpl : public ModelObj {
    * \sa Model::Create
    */
   explicit ModelImpl(String reload_lib_path, String model_path, picojson::object model_config,
-                     DLDevice device, const Optional<Session>& session, bool trace_enabled)
+                     DLDevice device, const Optional<Session>& session, int num_shards,
+                     bool trace_enabled)
       : model_(model_path), device_(device) {
     // Step 1. Process model config json string.
     LoadModelConfigJSON(model_config);
     // Step 2. Initialize vm, we use the packed function mechanism
     // so there is no explicit abi dependency on these extra
    // classes other than basic tvm runtime.
-    this->ft_.Init(reload_lib_path, device_, model_config, session);
+    this->ft_.Init(reload_lib_path, device_, model_config, session, num_shards);
     // Step 3. Reset
     this->Reset();
     // Step 4. Set model type

cpp/serve/model.h

Lines changed: 2 additions & 1 deletion
@@ -368,12 +368,13 @@ class Model : public ObjectRef {
    * \param model_config The model config json object.
    * \param device The device to run the model on.
    * \param session The session to run the model on.
+   * \param num_shards The number of tensor parallel shards of the model.
    * \param trace_enabled A boolean indicating whether tracing is enabled.
    * \return The created runtime module.
    */
   static Model Create(String reload_lib_path, String model_path,
                       const picojson::object& model_config, DLDevice device,
-                      const Optional<Session>& session, bool trace_enabled);
+                      const Optional<Session>& session, int num_shards, bool trace_enabled);
 
   /*!
    * Load the model config from the given model path.

docs/deploy/cli.rst

Lines changed: 51 additions & 64 deletions
@@ -3,102 +3,89 @@
 CLI
 ===============
 
-MLCChat CLI is the command line tool to run MLC-compiled LLMs out of the box.
+MLC Chat CLI is the command line tool to run MLC-compiled LLMs out of the box interactively.
 
 .. contents:: Table of Contents
   :local:
  :depth: 2
 
-Option 1. Conda Prebuilt
-~~~~~~~~~~~~~~~~~~~~~~~~
+Install MLC-LLM Package
+------------------------
 
-The prebuilt package supports Metal on macOS and Vulkan on Linux and Windows, and can be installed via Conda one-liner.
+Chat CLI is a part of the MLC-LLM package.
+To use the chat CLI, first install MLC LLM by following the instructions :ref:`here <install-mlc-packages>`.
+Once you have installed the MLC-LLM package, you can run the following command to check if the installation was successful:
 
-To use other GPU runtimes, e.g. CUDA, please instead :ref:`build it from source <mlcchat_build_from_source>`.
+.. code:: bash
 
-.. code:: shell
+  mlc_llm chat --help
 
-    conda activate your-environment
-    python3 -m pip install --pre -U -f https://mlc.ai/wheels mlc-llm-nightly mlc-ai-nightly
-    mlc_llm chat -h
+You should see the chat CLI help message if the installation was successful.
 
-.. note::
-   The prebuilt package supports **Metal** on macOS and **Vulkan** on Linux and Windows. It is possible to use other GPU runtimes such as **CUDA** by compiling MLCChat CLI from the source.
-
-
-Option 2. Build MLC Runtime from Source
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We also provide options to build mlc runtime libraries and ``mlc_llm`` from source.
-This step is useful if the prebuilt is unavailable on your platform, or if you would like to build a runtime
-that supports other GPU runtime than the prebuilt version. We can build a customized version
-of mlc chat runtime. You only need to do this if you choose not to use the prebuilt.
-
-First, make sure you install TVM unity (following the instruction in :ref:`install-tvm-unity`).
-Then please follow the instructions in :ref:`mlcchat_build_from_source` to build the necessary libraries.
-
-.. `|` adds a blank line
-
-|
+Quick Start
+------------
 
-Run Models through MLCChat CLI
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This section provides a quick start guide to work with the MLC-LLM chat CLI.
+To launch the CLI session, run the following command:
 
-Once ``mlc_llm`` is installed, you are able to run any MLC-compiled model on the command line.
+.. code:: bash
 
-To run a model with MLC LLM in any platform, you can either:
+  mlc_llm chat MODEL [--model-lib PATH-TO-MODEL-LIB]
 
-- Use off-the-shelf model prebuilts from the MLC Huggingface repo (see :ref:`Model Prebuilts` for details).
-- Use locally compiled model weights and libraries following :doc:`the model compilation page </compilation/compile_models>`.
+where ``MODEL`` is the model folder after compiling with the :ref:`MLC-LLM build process <compile-model-libraries>`. Information about other arguments can be found in the next section.
 
-**Option 1: Use model prebuilts**
-
-To run ``mlc_llm``, you can specify the Huggingface MLC prebuilt model repo path with the prefix ``HF://``.
-For example, to run the MLC Llama 3 8B Q4F16_1 model (`Repo link <https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC>`_),
-simply use ``HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC``. The model weights and library will be downloaded
-automatically from Huggingface.
-
-.. code:: shell
-
-  mlc_llm chat HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC --device "cuda:0" --overrides context_window_size=1024
+Once the chat CLI is ready, you can enter the prompt to interact with the model.
 
 .. code::
 
   You can use the following special commands:
    /help               print the special commands
    /exit               quit the cli
-    /stats              print out the latest stats (token/sec)
+    /stats              print out stats of last request (token/sec)
+    /metrics            print out full engine metrics
    /reset              restart a fresh chat
    /set [overrides]    override settings in the generation config. For example,
-                        `/set temperature=0.5;max_gen_len=100;stop=end,stop`
+                        `/set temperature=0.5;top_p=0.8;seed=23;max_tokens=100;stop=str1,str2`
                        Note: Separate stop words in the `stop` option with commas (,).
    Multi-line input: Use escape+enter to start a new line.
 
-  user: What's the meaning of life
-  assistant:
-   What a profound and intriguing question! While there's no one definitive answer, I'd be happy to help you explore some perspectives on the meaning of life.
+  >>> What's the meaning of life?
+  The meaning of life is a philosophical and metaphysical question related to the purpose or significance of life or existence in general...
+
+.. note::
+
+  If you want to enable tensor parallelism to run LLMs on multiple GPUs,
+  please specify the argument ``--overrides "tensor_parallel_shards=$NGPU"``.
+  For example,
+
+  .. code:: shell
 
-  The concept of the meaning of life has been debated and...
+    mlc_llm chat HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC --overrides "tensor_parallel_shards=2"
 
 
-**Option 2: Use locally compiled model weights and libraries**
+The ``mlc_llm chat`` Command
+----------------------------
 
-For models other than the prebuilt ones we provided:
+The chat CLI interface is listed below for reference.
 
-1. If the model is a variant to an existing model library (e.g. ``WizardMathV1.1`` and ``OpenHermes`` are variants of ``Mistral``),
-   follow :ref:`convert-weights-via-MLC` to convert the weights and reuse existing model libraries.
-2. Otherwise, follow :ref:`compile-model-libraries` to compile both the model library and weights.
+.. code:: bash
 
-Once you have the model locally compiled with a model library and model weights, to run ``mlc_llm``, simply
+  mlc_llm chat MODEL [--model-lib PATH-TO-MODEL-LIB] [--device DEVICE] [--overrides OVERRIDES]
 
-- Specify the path to ``mlc-chat-config.json`` and the converted model weights to ``--model``
-- Specify the path to the compiled model library (e.g. a .so file) to ``--model-lib``
 
-.. code:: shell
+MODEL                  The model folder after compiling with the MLC-LLM build process. The parameter
+                       can either be the model name with its quantization scheme
+                       (e.g. ``Llama-2-7b-chat-hf-q4f16_1``), or a full path to the model
+                       folder. In the former case, we will use the provided name to search
+                       for the model folder over possible paths.
 
-  mlc_llm chat dist/Llama-2-7b-chat-hf-q4f16_1-MLC \
-    --device "cuda:0" --overrides context_window_size=1024 \
-    --model-lib dist/prebuilt_libs/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f16_1-vulkan.so
-    # CUDA on Linux: dist/prebuilt_libs/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f16_1-cuda.so
-    # Metal on macOS: dist/prebuilt_libs/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f16_1-metal.so
-    # Same rule applies for other platforms
+--model-lib            A field to specify the full path to the model library file to use (e.g. a ``.so`` file).
+--device               The description of the device to run on. The user should provide a string in the
+                       form of ``device_name:device_id`` or ``device_name``, where ``device_name`` is one of
+                       ``cuda``, ``metal``, ``vulkan``, ``rocm``, ``opencl``, ``auto`` (automatically detect the
+                       local device), and ``device_id`` is the device id to run on. The default value is ``auto``,
+                       with the device id set to 0 by default.
+--overrides            Model configuration override. Supports overriding
+                       ``context_window_size``, ``prefill_chunk_size``, ``sliding_window_size``, ``attention_sink_size``,
+                       ``max_batch_size`` and ``tensor_parallel_shards``. The overrides can be explicitly
+                       specified via detailed knobs, e.g. --overrides ``context_window_size=1024;prefill_chunk_size=128``.
