InfiniTensor
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 34 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 15 additions & 3 deletions
diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
Lines changed: 14 additions & 10 deletions
diff --git a/xtask/src/service/cache_manager.rs b/xtask/src/service/cache_manager.rs
Lines changed: 2 additions & 2 deletions
diff --git a/xtask/src/service/client.rs b/xtask/src/service/client.rs
Lines changed: 8 additions & 3 deletions
diff --git a/xtask/src/service/error.rs b/xtask/src/service/error.rs
Lines changed: 18 additions & 1 deletion
@@ -89,14 +89,26 @@ cargo service --help
 ```plaintext
 web service
 
-Usage: xtask service [OPTIONS] --port <PORT> <MODEL>
+Usage: xtask service [OPTIONS] --port <PORT> <FILE>
 
 Arguments:
-  <MODEL>
+  <FILE>
 
 Options:
+  -p, --port <PORT>
+      --no-cuda-graph
+      --name <NAME>
       --gpus <GPUS>
       --max-steps <MAX_STEPS>
-  -p, --port <PORT>
+      --think
   -h, --help
 ```
+
+通过 TOML 配置文件可以配置多模型服务。示例格式：
+
+```toml
+[model-name]
+path = "model-path"
+think = true
+max-steps = 2048
+```
@@ -17,6 +17,7 @@ ratatui = "0.29"
 
 serde.workspace = true
 serde_json = "1.0"
+toml = "0.8"
 tokio = { version = "1.45", features = ["rt-multi-thread", "net"] }
 hyper = { version = "1.6", features = ["http1", "server"] }
 hyper-util = { version = "0.1", features = ["http1", "tokio", "server"] }
 
@@ -55,23 +55,27 @@ struct BaseArgs {
 
 impl BaseArgs {
     fn gpus(&self) -> Box<[c_int]> {
-        self.gpus
-            .as_ref()
-            .map(|devices| {
-                static NUM_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap());
-                NUM_REGEX
-                    .find_iter(devices)
-                    .map(|c| c.as_str().parse().unwrap())
-                    .collect()
-            })
-            .unwrap_or_else(|| [0].into())
+        parse_gpus(self.gpus.as_deref())
     }
 
     fn max_steps(&self) -> usize {
         self.max_steps.unwrap_or(1000)
     }
 }
 
+/// Parses a GPU device list such as `"0,1,3"` into device ordinals.
+///
+/// Every maximal run of ASCII digits in `config` becomes one device id, so any
+/// non-digit text acts as a separator. When `config` is `None`, the result is
+/// the single device `0`. A `Some` string containing no digits yields an empty
+/// list.
+///
+/// # Panics
+///
+/// Panics if a digit run does not fit in `c_int`.
+fn parse_gpus(config: Option<&str>) -> Box<[c_int]> {
+    // ASCII-only on purpose: `\d` would also match non-ASCII decimal digits,
+    // which the integer parser below rejects.
+    static NUM_REGEX: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"[0-9]+").expect("digit pattern is a valid regex"));
+
+    let Some(devices) = config else {
+        return [0].into();
+    };
+
+    NUM_REGEX
+        .find_iter(devices)
+        .map(|digits| {
+            digits
+                .as_str()
+                .parse()
+                .expect("GPU index does not fit in c_int")
+        })
+        .collect()
+}
+
 mod macros {
     macro_rules! print_now {
         ($($arg:tt)*) => {{
 
@@ -23,7 +23,7 @@ impl CacheManager {
         &mut self,
         tokens: Vec<utok>,
         sample_args: SampleArgs,
-        max_steps: usize,
+        max_tokens: usize,
     ) -> (SessionId, Vec<utok>) {
         static SESSION_ID: AtomicUsize = AtomicUsize::new(0);
         let id = SessionId(SESSION_ID.fetch_add(1, SeqCst));
@@ -51,7 +51,7 @@ impl CacheManager {
                 cache,
             },
             &tokens[pos..],
-            max_steps,
+            max_tokens,
         );
         (id, tokens)
     }
 
@@ -1,6 +1,8 @@
-use super::*;
+use super::openai::POST_CHAT_COMPLETIONS;
 use log::{info, trace, warn};
-use openai_struct::CreateChatCompletionStreamResponse;
+use openai_struct::{
+    ChatCompletionRequestMessage, CreateChatCompletionRequest, CreateChatCompletionStreamResponse,
+};
 use reqwest::header::{CONTENT_TYPE, HeaderMap, HeaderValue};
 use std::{env::VarError, time::Instant};
 use tokio::time::Duration;
@@ -72,7 +74,10 @@ async fn send_single_request(
     }
 
     let req = client
-        .post(format!("http://localhost:{port}{V1_CHAT_COMPLETIONS}"))
+        .post(format!(
+            "http://localhost:{port}{}",
+            POST_CHAT_COMPLETIONS.1
+        ))
         .headers(headers.clone())
         .body(req_body)
         .timeout(Duration::from_secs(100));
 
@@ -1,11 +1,13 @@
-use hyper::{Method, StatusCode};
+use hyper::{Method, StatusCode};
 use serde::Serialize;
+use std::fmt;
 
 #[derive(Debug)]
 pub(crate) enum Error {
     WrongJson(serde_json::Error),
     NotFound(NotFoundError),
     MsgNotSupported(MsgNotSupportedError),
+    ModelNotFound(String),
 }
 
 #[derive(Serialize, Debug)]
@@ -39,6 +41,7 @@ impl Error {
             Self::WrongJson(..) => StatusCode::BAD_REQUEST,
             Self::NotFound(..) => StatusCode::NOT_FOUND,
             Self::MsgNotSupported(..) => StatusCode::BAD_REQUEST,
+            Self::ModelNotFound(..) => StatusCode::NOT_FOUND,
         }
     }
 
@@ -48,6 +51,20 @@ impl Error {
             Self::WrongJson(e) => e.to_string(),
             Self::NotFound(e) => serde_json::to_string(&e).unwrap(),
             Self::MsgNotSupported(e) => serde_json::to_string(&e).unwrap(),
+            Self::ModelNotFound(model) => format!("Model not found: {}", model),
         }
     }
 }
+
+/// Human-readable rendering of each error variant.
+impl fmt::Display for Error {
+    /// Writes a short prefix naming the failure, followed by the variant's payload.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // `Self::` paths keep this in line with the other `match self` blocks on this type.
+        match self {
+            Self::WrongJson(e) => write!(f, "Invalid JSON: {}", e),
+            Self::NotFound(e) => write!(f, "Not Found: {} {}", e.method, e.uri),
+            // The message payload is rendered through its `Debug` form.
+            Self::MsgNotSupported(e) => write!(f, "Message type not supported: {:?}", e.message),
+            Self::ModelNotFound(model) => write!(f, "Model not found: {}", model),
+        }
+    }
+}
+
+// Empty body: every `std::error::Error` method keeps its default implementation.
+impl std::error::Error for Error {}