feat(xtask): 增加max session控制处理并发

Ceng23333 · Ceng23333 · commit 783262e3734b · 2025-08-21T17:34:37.000+08:00
Signed-off-by: Ceng23333 <441651826@qq.com>
diff --git a/xtask/src/service/error.rs b/xtask/src/service/error.rs
@@ -8,6 +8,7 @@ pub(crate) enum Error {
     NotFound(NotFoundError),
     MsgNotSupported(MsgNotSupportedError),
     ModelNotFound(String),
+    TooManyConnections,
 }
 
 #[derive(Serialize, Debug)]
@@ -42,6 +43,7 @@ impl Error {
             Self::NotFound(..) => StatusCode::NOT_FOUND,
             Self::MsgNotSupported(..) => StatusCode::BAD_REQUEST,
             Self::ModelNotFound(..) => StatusCode::NOT_FOUND,
+            Self::TooManyConnections => StatusCode::TOO_MANY_REQUESTS,
         }
     }
 
@@ -52,6 +54,9 @@ impl Error {
             Self::NotFound(e) => serde_json::to_string(&e).unwrap(),
             Self::MsgNotSupported(e) => serde_json::to_string(&e).unwrap(),
             Self::ModelNotFound(model) => format!("Model not found: {model}"),
+            Self::TooManyConnections => {
+                "Too many concurrent connections. Please try again later.".to_string()
+            }
         }
     }
 }
@@ -63,6 +68,10 @@ impl fmt::Display for Error {
             Error::NotFound(e) => write!(f, "Not Found: {} {}", e.method, e.uri),
             Error::MsgNotSupported(e) => write!(f, "Message type not supported: {:?}", e.message),
             Error::ModelNotFound(model) => write!(f, "Model not found: {model}"),
+            Error::TooManyConnections => write!(
+                f,
+                "Too many concurrent connections. Please try again later."
+            ),
         }
     }
 }
diff --git a/xtask/src/service/mod.rs b/xtask/src/service/mod.rs
@@ -59,6 +59,8 @@ pub struct ServiceArgs {
     #[clap(long)]
     max_tokens: Option<usize>,
     #[clap(long)]
+    max_sessions: Option<usize>,
+    #[clap(long)]
     temperature: Option<f32>,
     #[clap(long)]
     top_p: Option<f32>,
@@ -74,6 +76,8 @@ pub struct ModelConfig {
     pub gpus: Option<Box<[c_int]>>,
     #[serde(rename = "max-tokens")]
     pub max_tokens: Option<usize>,
+    #[serde(rename = "max-sessions")]
+    pub max_sessions: Option<usize>,
     pub temperature: Option<f32>,
     #[serde(rename = "top-p")]
     pub top_p: Option<f32>,
@@ -92,6 +96,7 @@ impl ServiceArgs {
             name,
             gpus,
             max_tokens,
+            max_sessions,
             temperature,
             top_p,
             repetition_penalty,
@@ -109,6 +114,7 @@ impl ServiceArgs {
                     path: file.clone(),
                     gpus: Some(parse_gpus(gpus.as_deref())),
                     max_tokens,
+                    max_sessions,
                     temperature,
                     top_p,
                     repetition_penalty,
@@ -146,7 +152,7 @@ async fn start_infer_service(
     handles: Vec<(Arc<Model>, Service)>,
     port: u16,
 ) -> std::io::Result<()> {
-    let app = App(Arc::new(models));
+    let app = App::new(models);
 
     let _handles = handles
         .into_iter()
@@ -174,22 +180,50 @@ async fn start_infer_service(
 }
 
 #[derive(Clone)]
-struct App(Arc<HashMap<String, Arc<Model>>>);
+struct App(Arc<HashMap<String, Arc<Model>>>, Arc<AtomicUsize>); // .0: shared model map; .1: shared count of requests currently holding a slot
+
+impl App {
+    fn new(models: HashMap<String, Arc<Model>>) -> Self { // wraps the model map in an `Arc` and starts the slot counter at 0
+        App(Arc::new(models), Arc::new(AtomicUsize::new(0)))
+    }
+
+    fn try_acquire_connection(&self) -> bool { // true when a slot was taken; false (counter restored) when the cap was already reached
+        const MAX_CONCURRENT_CONNECTIONS: usize = 32; // compiled-in cap; NOTE(review): not derived from the `max_sessions` option - confirm that is intended
+        let current = self.1.fetch_add(1, SeqCst); // `fetch_add` returns the value held *before* the increment
+        if current >= MAX_CONCURRENT_CONNECTIONS {
+            self.1.fetch_sub(1, SeqCst); // undo the optimistic increment; the counter briefly exceeds the cap in between
+            false
+        } else {
+            true
+        }
+    }
+
+    fn release_connection(&self) { // decrements unconditionally; NOTE(review): an early `?` return inside `call`'s async block never reaches this, leaking the slot
+        self.1.fetch_sub(1, SeqCst);
+    }
+}
 
 impl HyperService<Request<Incoming>> for App {
     type Response = Response<BoxBody<Bytes, hyper::Error>>;
     type Error = hyper::Error;
     type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;
 
     fn call(&self, req: Request<Incoming>) -> Self::Future {
-        match (req.method(), req.uri().path()) {
-            openai::GET_MODELS => {
-                let json = json(create_models(self.0.keys().cloned()));
-                Box::pin(async move { Ok(json) })
-            }
-            openai::POST_COMPLETIONS => {
-                let models = self.0.clone();
-                Box::pin(async move {
+        // Try to acquire a connection slot
+        if !self.try_acquire_connection() {
+            let response = error(Error::TooManyConnections);
+            return Box::pin(async move { Ok(response) });
+        }
+
+        let app_clone = self.clone();
+        Box::pin(async move {
+            let result = match (req.method(), req.uri().path()) {
+                openai::GET_MODELS => {
+                    let json = json(create_models(app_clone.0.keys().cloned()));
+                    Ok(json)
+                }
+                openai::POST_COMPLETIONS => {
+                    let models = app_clone.0.clone();
                     let whole_body = req.collect().await?.to_bytes();
                     let req: CreateCompletionRequest = match serde_json::from_slice(&whole_body) {
                         Ok(req) => req,
@@ -261,11 +295,9 @@ impl HyperService<Request<Incoming>> for App {
 
                     let response = completion_response(id, created, model_name, content_, reason_);
                     Ok(json(response))
-                })
-            }
-            openai::POST_CHAT_COMPLETIONS => {
-                let models = self.0.clone();
-                Box::pin(async move {
+                }
+                openai::POST_CHAT_COMPLETIONS => {
+                    let models = app_clone.0.clone();
                     let whole_body = req.collect().await?.to_bytes();
 
                     let req: CreateChatCompletionRequest = match serde_json::from_slice(&whole_body)
@@ -352,14 +384,17 @@ impl HyperService<Request<Incoming>> for App {
                         reason_,
                     );
                     Ok(json(response))
-                })
-            }
-            // Return 404 Not Found for other routes.
-            (method, uri) => {
-                let msg = Error::not_found(method, uri);
-                Box::pin(async move { Ok(error(msg)) })
-            }
-        }
+                }
+                (method, uri) => {
+                    let msg = Error::not_found(method, uri);
+                    Ok(error(msg))
+                }
+            };
+
+            // Always release the connection when done
+            app_clone.release_connection();
+            result
+        })
     }
 }
 
diff --git a/xtask/src/service/model.rs b/xtask/src/service/model.rs
@@ -20,6 +20,7 @@ use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
 
 pub(super) struct Model {
     max_tokens: usize,
+    max_sessions: usize,
     sampling: SampleArgs,
     think: [utok; 2],
     enable_thinking: bool,
@@ -68,6 +69,7 @@ impl Model {
             path,
             gpus,
             max_tokens,
+            max_sessions,
             temperature,
             top_p,
             repetition_penalty,
@@ -99,6 +101,7 @@ impl Model {
 
         let model = Model {
             max_tokens: max_tokens.unwrap_or(2 << 10),
+            max_sessions: max_sessions.unwrap_or(16), // Default to 16 if not specified
             sampling: SampleArgs::new(
                 temperature.unwrap_or(0.),
                 top_p.unwrap_or(1.),
@@ -121,6 +124,15 @@ impl Model {
         (model, service)
     }
 
+    fn check_session_limit(&self) -> Result<(), Error> { // Err(TooManyConnections) once `sessions` holds `max_sessions` entries or more, Ok(()) otherwise
+        let sessions = self.sessions.lock().unwrap(); // assumes `sessions` is a std `Mutex` (TODO confirm): `unwrap` then panics if the lock is poisoned
+        if sessions.len() >= self.max_sessions {
+            Err(Error::TooManyConnections)
+        } else {
+            Ok(())
+        }
+    } // the guard is dropped here, so the result is only a snapshot; NOTE(review): confirm callers tolerate a session being added after the check
+
     pub fn serve(&self, service: &mut Service) {
         let [think, _think] = self.think;
         loop {
@@ -263,6 +275,7 @@ impl Model {
         &self,
         req: CreateChatCompletionRequest,
     ) -> Result<UnboundedReceiver<Output>, Error> {
+        self.check_session_limit()?;
         let CreateChatCompletionRequest {
             messages,
             max_tokens,
@@ -347,6 +360,7 @@ impl Model {
         &self,
         req: CreateCompletionRequest,
     ) -> Result<UnboundedReceiver<Output>, Error> {
+        self.check_session_limit()?;
         let CreateCompletionRequest {
             prompt,
             max_tokens,

Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ pub(crate) enum Error {`
`8`	`8`	`NotFound(NotFoundError),`
`9`	`9`	`MsgNotSupported(MsgNotSupportedError),`
`10`	`10`	`ModelNotFound(String),`
	`11`	`+ TooManyConnections,`
`11`	`12`	`}`
`12`	`13`
`13`	`14`	`#[derive(Serialize, Debug)]`
`@@ -42,6 +43,7 @@ impl Error {`
`42`	`43`	`Self::NotFound(..) => StatusCode::NOT_FOUND,`
`43`	`44`	`Self::MsgNotSupported(..) => StatusCode::BAD_REQUEST,`
`44`	`45`	`Self::ModelNotFound(..) => StatusCode::NOT_FOUND,`
	`46`	`+ Self::TooManyConnections => StatusCode::TOO_MANY_REQUESTS,`
`45`	`47`	`}`
`46`	`48`	`}`
`47`	`49`
`@@ -52,6 +54,9 @@ impl Error {`
`52`	`54`	`Self::NotFound(e) => serde_json::to_string(&e).unwrap(),`
`53`	`55`	`Self::MsgNotSupported(e) => serde_json::to_string(&e).unwrap(),`
`54`	`56`	`Self::ModelNotFound(model) => format!("Model not found: {model}"),`
	`57`	`+ Self::TooManyConnections => {`
	`58`	`+ "Too many concurrent connections. Please try again later.".to_string()`
	`59`	`+ }`
`55`	`60`	`}`
`56`	`61`	`}`
`57`	`62`	`}`
`@@ -63,6 +68,10 @@ impl fmt::Display for Error {`
`63`	`68`	`Error::NotFound(e) => write!(f, "Not Found: {} {}", e.method, e.uri),`
`64`	`69`	`Error::MsgNotSupported(e) => write!(f, "Message type not supported: {:?}", e.message),`
`65`	`70`	`Error::ModelNotFound(model) => write!(f, "Model not found: {model}"),`
	`71`	`+ Error::TooManyConnections => write!(`
	`72`	`+ f,`
	`73`	`+ "Too many concurrent connections. Please try again later."`
	`74`	`+ ),`
`66`	`75`	`}`
`67`	`76`	`}`
`68`	`77`	`}`