
Commit e3d7bad

CI Fixes (#3415)
This commit fixes most of the CI tests.
1 parent 889227d · commit e3d7bad
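Most of the diffs below are mechanical formatting fixes: a long function call or signature is split so that each argument sits on its own line and ends with a trailing comma, the layout that formatters such as black produce once a line exceeds the configured length (the exact formatter and line-length limit are assumptions; the commit message only says it fixes CI). A minimal sketch of the pattern, using hypothetical values rather than code from this repository:

# Hypothetical example only; these names and values are not from the commit.
# Before: every argument on one line, which can exceed a lint line-length limit.
endpoint = dict(host="127.0.0.1", port=8000, timeout=60.0, include_server_metrics=False, no_debug_config=True)

# After: one argument per line, each ending with a trailing comma -- the form used throughout this commit.
endpoint = dict(
    host="127.0.0.1",
    port=8000,
    timeout=60.0,
    include_server_metrics=False,
    no_debug_config=True,
)
print(endpoint)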

114 files changed: 1,161 additions & 389 deletions

Note: some diffs are hidden by default for large commits, so only a subset of the 114 changed files appears below.


3rdparty/tvm

Submodule tvm updated 458 files

python/mlc_llm/bench/api_endpoint.py

Lines changed: 5 additions & 1 deletion
@@ -451,7 +451,11 @@ def create_api_endpoint(args: argparse.Namespace) -> APIEndPoint:
         return OpenAIEndPoint(args.host, args.port, args.timeout, args.include_server_metrics)
     if args.api_endpoint == "vllm":
         return OpenAIEndPoint(
-            args.host, args.port, args.timeout, include_server_metrics=False, no_debug_config=True
+            args.host,
+            args.port,
+            args.timeout,
+            include_server_metrics=False,
+            no_debug_config=True,
         )
     if args.api_endpoint == "openai-chat":
         return OpenAIChatEndPoint(args.host, args.port, args.timeout, args.include_server_metrics)

python/mlc_llm/bench/dataset.py

Lines changed: 13 additions & 3 deletions
@@ -723,8 +723,16 @@ def __init__(self, dataset_path: str, tokenizer: AutoTokenizer) -> None:
         self.dataset = [
             (
                 entry["TIMESTAMP"],
-                min(entry["ContextTokens"], tokenizer.model_max_length, self.truncate_length),
-                min(entry["GeneratedTokens"], tokenizer.model_max_length, self.truncate_length),
+                min(
+                    entry["ContextTokens"],
+                    tokenizer.model_max_length,
+                    self.truncate_length,
+                ),
+                min(
+                    entry["GeneratedTokens"],
+                    tokenizer.model_max_length,
+                    self.truncate_length,
+                ),
             )
             for _, entry in df.iterrows()
             if entry["ContextTokens"] >= 4 and entry["GeneratedTokens"] >= 4
@@ -836,7 +844,9 @@ def create_dataset( # pylint: disable=too-many-return-statements,too-many-branc
             args.apply_chat_template is False
         ), "LLMPerf dataset does not support applying chat template"
         return LLMPerfDataset(
-            args.dataset_path, (args.num_requests + args.num_warmup_requests) * 4, tokenizer
+            args.dataset_path,
+            (args.num_requests + args.num_warmup_requests) * 4,
+            tokenizer,
         )
     if args.dataset == "json-mode-eval":
         assert (

python/mlc_llm/bench/request_processor.py

Lines changed: 3 additions & 1 deletion
@@ -601,7 +601,9 @@ async def _task(request_record: RequestRecord) -> None:
 
 
 def create_pipelines(  # pylint: disable=too-many-branches
-    args: argparse.Namespace, f_create_api_endpoint: Callable[[], APIEndPoint], dataset: Dataset
+    args: argparse.Namespace,
+    f_create_api_endpoint: Callable[[], APIEndPoint],
+    dataset: Dataset,
 ) -> List[RequestProcessor]:
     """Creating request processing pipelines with regard to the specified args."""
     cuda_profile_url = f"http://{args.host}:{args.port}" if args.cuda_profile else None

python/mlc_llm/bench/request_record.py

Lines changed: 10 additions & 2 deletions
@@ -113,7 +113,9 @@ def generate_metrics_summary(
     return report
 
 
-def _compute_metrics_statistics(metrics: List[Union[Metrics, ServerMetrics]]) -> Dict[str, Any]:
+def _compute_metrics_statistics(
+    metrics: List[Union[Metrics, ServerMetrics]],
+) -> Dict[str, Any]:
     """
     Compute the statistics of the metrics.
 
@@ -133,7 +135,13 @@ def _compute_metrics_statistics(metrics: List[Union[Metrics, ServerMetrics]]) ->
     report: Dict = {}
     df = pd.DataFrame([metric.model_dump() for metric in metrics])
     for key, _ in metrics[0].model_fields.items():
-        if key in ["success", "start_time", "finish_time", "server_metrics", "exec_feature"]:
+        if key in [
+            "success",
+            "start_time",
+            "finish_time",
+            "server_metrics",
+            "exec_feature",
+        ]:
             continue
         if key in df.columns:
             series = df[key].dropna()

python/mlc_llm/cli/calibrate.py

Lines changed: 9 additions & 2 deletions
@@ -28,12 +28,19 @@ def main(argv):
         help=HELP["model_lib"] + ' (default: "%(default)s")',
     )
     parser.add_argument(
-        "--output", "-o", type=str, required=True, help=HELP["output_calibration"] + " (required)"
+        "--output",
+        "-o",
+        type=str,
+        required=True,
+        help=HELP["output_calibration"] + " (required)",
     )
     # Download dataset from
     # https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
     parser.add_argument(
-        "--dataset", type=str, required=True, help=HELP["calibration_dataset"] + " (required)"
+        "--dataset",
+        type=str,
+        required=True,
+        help=HELP["calibration_dataset"] + " (required)",
     )
 
     parser.add_argument(

python/mlc_llm/cli/compile.py

Lines changed: 4 additions & 1 deletion
@@ -121,7 +121,10 @@ def _check_system_lib_prefix(prefix: str) -> str:
     parsed.model_type = detect_model_type(parsed.model_type, parsed.model)
     parsed.quantization = detect_quantization(parsed.quantization, parsed.model)
     parsed.system_lib_prefix = detect_system_lib_prefix(
-        parsed.device, parsed.system_lib_prefix, parsed.model_type.name, parsed.quantization.name
+        parsed.device,
+        parsed.system_lib_prefix,
+        parsed.model_type.name,
+        parsed.quantization.name,
     )
     with open(parsed.model, "r", encoding="utf-8") as config_file:
         config = json.load(config_file)

python/mlc_llm/cli/delivery.py

Lines changed: 16 additions & 3 deletions
@@ -172,7 +172,11 @@ def _run_quantization(
         ]
         print(" ".join(cmd), file=log_file, flush=True)
         subprocess.run(
-            cmd, check=False, stdout=log_file, stderr=subprocess.STDOUT, env=os.environ
+            cmd,
+            check=False,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
+            env=os.environ,
         )
     logger.info("[MLC] Complete!")
     if not (Path(output_dir) / "tensor-cache.json").exists() and not model_info.gen_config_only:
@@ -225,7 +229,13 @@ def _generate_model_delivery_diff( # pylint: disable=too-many-locals
         quantization = task.quantization
         overrides = {**default_overrides, **task.overrides}
 
-        logger.info("Checking task: %s %s %s %s", model_id, conv_template, quantization, overrides)
+        logger.info(
+            "Checking task: %s %s %s %s",
+            model_id,
+            conv_template,
+            quantization,
+            overrides,
+        )
         log_tasks = [t for t in log.tasks if t.model_id == model_id]
         delivered_quantizations = set()
         gen_config_only = set()
@@ -260,7 +270,10 @@ def _generate_model_delivery_diff( # pylint: disable=too-many-locals
     diff_config.default_overrides = {}
    diff_config.tasks = diff_tasks
 
-    logger.info("Model delivery diff: %s", diff_config.model_dump_json(indent=4, exclude_none=True))
+    logger.info(
+        "Model delivery diff: %s",
+        diff_config.model_dump_json(indent=4, exclude_none=True),
+    )
 
     return diff_config
 

python/mlc_llm/cli/disco_remote_socket_session.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Internal remote disco socket session."""
+
 import sys
 
 from tvm import runtime as _  # pylint: disable=unused-import

python/mlc_llm/cli/serve.py

Lines changed: 5 additions & 1 deletion
@@ -53,7 +53,11 @@ def __repr__(self) -> str:
         print(f";sliding_window_size={self.sliding_window_size}", file=out, end="")
         print(f";attention_sink_size={self.attention_sink_size}", file=out, end="")
         print(f";tensor_parallel_shards={self.tensor_parallel_shards}", file=out, end="")
-        print(f";pipeline_parallel_stages={self.pipeline_parallel_stages}", file=out, end="")
+        print(
+            f";pipeline_parallel_stages={self.pipeline_parallel_stages}",
+            file=out,
+            end="",
+        )
         print(f";opt={self.opt}", file=out, end="")
         return out.getvalue().rstrip()
