Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions crates/hamgrd/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ async fn main() {

let runtime_data = RuntimeData::new(args.slot_id, swbus_config.npu_ipv4, swbus_config.npu_ipv6);

// Setup swbus and actor runtime
// Setup swbus and actor runtime to the first endpoint of swbusd
let mut swbus_edge = SwbusEdgeRuntime::new(
format!("http://{}", swbus_config.endpoint),
format!("http://{}", swbus_config.endpoints.first().unwrap()),
swbus_sp.clone(),
ConnectionType::InNode,
);
Expand Down
7 changes: 5 additions & 2 deletions crates/swbus-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ async fn main() {
sp.service_type = "swbus-cli".to_string();
sp.service_id = Uuid::new_v4().to_string();
let mut runtime = SwbusEdgeRuntime::new(
format!("http://{}", swbus_config.endpoint),
format!("http://{}", swbus_config.endpoints.first().unwrap()),
sp.clone(),
ConnectionType::Client,
);
Expand Down Expand Up @@ -269,7 +269,10 @@ mod tests {

std::env::set_var("DEV", format!("dpu{slot}"));
let config = get_swbus_config(None).unwrap();
assert_eq!(config.endpoint.to_string(), format!("{}:{}", "10.0.1.0", 23606 + slot));
assert_eq!(
config.endpoints.first().unwrap().to_string(),
format!("{}:{}", "10.0.1.0", 23606 + slot)
);
let expected_sp = ServicePath::with_node("region-a", "cluster-a", &format!("host1-dpu{slot}"), "", "", "", "");
assert!(config
.routes
Expand Down
62 changes: 40 additions & 22 deletions crates/swbus-config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const CONFIG_DB: &str = "CONFIG_DB";

#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
pub struct SwbusConfig {
pub endpoint: SocketAddr,
pub endpoints: Vec<SocketAddr>,
pub routes: Vec<RouteConfig>,
pub peers: Vec<PeerConfig>,
pub npu_ipv4: Option<Ipv4Addr>,
Expand Down Expand Up @@ -174,7 +174,6 @@ fn peer_config_from_dpu_entry(
"swbusd_port is not found in dpu {key} is not found"
)))?;

// dual stack is not supported. Either all ipv4 or all ipv6.
if let Some(npu_ipv4) = dpu_entry.npu_ipv4 {
let npu_ipv4 = npu_ipv4
.parse::<Ipv4Addr>()
Expand All @@ -184,7 +183,9 @@ fn peer_config_from_dpu_entry(
endpoint: SocketAddr::new(IpAddr::V4(npu_ipv4), swbusd_port),
conn_type: ConnectionType::InCluster,
});
} else if let Some(npu_ipv6) = dpu_entry.npu_ipv6 {
}

if let Some(npu_ipv6) = dpu_entry.npu_ipv6 {
let npu_ipv6 = npu_ipv6
.parse::<Ipv6Addr>()
.map_err(|_| SwbusConfigError::InvalidConfig(format!("Invalid IPv6 address: {npu_ipv6}")))?;
Expand Down Expand Up @@ -272,7 +273,7 @@ fn get_loopback_address(lb_index: u32) -> Result<(Option<Ipv4Addr>, Option<Ipv6A
pub fn swbus_config_from_db(dpu_id: u32) -> Result<SwbusConfig> {
let mut peers = Vec::new();
let mut myroutes: Option<Vec<RouteConfig>> = None;
let mut myendpoint: Option<SocketAddr> = None;
let mut myendpoints: Vec<SocketAddr> = Vec::new();

let (region, cluster, hostname) = get_device_info()?;

Expand Down Expand Up @@ -303,9 +304,10 @@ pub fn swbus_config_from_db(dpu_id: u32) -> Result<SwbusConfig> {
myroutes = Some(route_config_from_dpu_entry(&dpu, &region, &cluster, &hostname));

if let Some(npu_ipv4) = dpu.npu_ipv4 {
myendpoint = Some(SocketAddr::new(std::net::IpAddr::V4(npu_ipv4), swbusd_port));
} else if let Some(npu_ipv6) = dpu.npu_ipv6 {
myendpoint = Some(SocketAddr::new(std::net::IpAddr::V6(npu_ipv6), swbusd_port));
myendpoints.push(SocketAddr::new(std::net::IpAddr::V4(npu_ipv4), swbusd_port));
}
if let Some(npu_ipv6) = dpu.npu_ipv6 {
myendpoints.push(SocketAddr::new(std::net::IpAddr::V6(npu_ipv6), swbusd_port));
}
continue;
}
Expand Down Expand Up @@ -344,7 +346,7 @@ pub fn swbus_config_from_db(dpu_id: u32) -> Result<SwbusConfig> {
info!("successfully load swbus config from configdb for dpu {}", dpu_id);

Ok(SwbusConfig {
endpoint: myendpoint.unwrap(),
endpoints: myendpoints,
routes: myroutes.unwrap(),
peers,
npu_ipv4: my_ipv4,
Expand All @@ -359,14 +361,17 @@ pub fn swbus_config_from_yaml(yaml_file: &str) -> Result<SwbusConfig> {
// Parse the YAML data
let mut swbus_config: SwbusConfig = serde_yaml::from_reader(reader)
.map_err(|e| SwbusConfigError::InvalidConfig(format!("Failed to parse YAML file: {e}")))?;
let ip = swbus_config.endpoint.ip();

match ip {
IpAddr::V4(ipv4) => {
swbus_config.npu_ipv4 = Some(ipv4);
}
IpAddr::V6(ipv6) => {
swbus_config.npu_ipv6 = Some(ipv6);
let ips = swbus_config.endpoints.iter().map(|addr| addr.ip()).collect::<Vec<_>>();

for ip in ips {
match ip {
IpAddr::V4(ipv4) => {
swbus_config.npu_ipv4 = Some(ipv4);
}
IpAddr::V6(ipv6) => {
swbus_config.npu_ipv6 = Some(ipv6);
}
}
}

Expand Down Expand Up @@ -474,25 +479,35 @@ mod tests {
let mut config_fromdb = swbus_config_from_db(0).unwrap();

assert_eq!(config_fromdb.routes.len(), 1);
assert_eq!(config_fromdb.peers.len(), 5);
assert_eq!(config_fromdb.peers.len(), 10);

// create equivalent config in yaml
let yaml_content = r#"
endpoint: "10.0.1.0:23606"
endpoints: ["10.0.1.0:23606", "[2001:db8:1::]:23606"]
routes:
- key: "region-a.cluster-a.host1-dpu0"
scope: "InCluster"
peers:
- endpoint: "10.0.1.0:23607"
conn_type: "InCluster"
- endpoint: "[2001:db8:1::]:23607"
conn_type: "InCluster"
- endpoint: "10.0.1.1:23606"
conn_type: "InCluster"
- endpoint: "[2001:db8:1::1]:23606"
conn_type: "InCluster"
- endpoint: "10.0.1.1:23607"
conn_type: "InCluster"
- endpoint: "[2001:db8:1::1]:23607"
conn_type: "InCluster"
- endpoint: "10.0.1.2:23606"
conn_type: "InCluster"
- endpoint: "[2001:db8:1::2]:23606"
conn_type: "InCluster"
- endpoint: "10.0.1.2:23607"
conn_type: "InCluster"
- endpoint: "[2001:db8:1::2]:23607"
conn_type: "InCluster"
"#;

let dir = tempdir().unwrap();
Expand All @@ -505,9 +520,10 @@ mod tests {
expected.npu_ipv6 = Some(Ipv6Addr::from_str("2001:db8:1::").unwrap());
// sort before compare
config_fromdb.routes.sort_by(|a, b| a.key.cmp(&b.key));
config_fromdb.peers.sort_by(|a, b| a.endpoint.cmp(&b.endpoint));
config_fromdb.peers.sort_by_key(|p| p.endpoint.to_string());
expected.routes.sort_by(|a, b| a.key.cmp(&b.key));
expected.peers.sort_by(|a, b| a.endpoint.cmp(&b.endpoint));
expected.peers.sort_by_key(|p| p.endpoint.to_string());

assert_eq!(config_fromdb, expected);

cleanup_configdb_for_test();
Expand All @@ -516,14 +532,16 @@ mod tests {
#[test]
fn test_load_from_yaml() {
let yaml_content = r#"
endpoint: 10.0.0.1:8000
endpoints: ["10.0.0.1:8000"]
routes:
- key: "region-a.cluster-a.10.0.0.1-dpu0"
scope: "InCluster"
peers:
- endpoint: "10.0.0.2:8000"
- id: "region-a.cluster-a.10.0.0.2-dpu0"
endpoint: "10.0.0.2:8000"
conn_type: "InCluster"
- endpoint: "10.0.0.3:8000"
- id: "region-a.cluster-a.10.0.0.3-dpu0"
endpoint: "10.0.0.3:8000"
conn_type: "InCluster"
"#;

Expand Down
2 changes: 2 additions & 0 deletions crates/swbus-core/src/mux/multiplexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,8 @@ impl SwbusMultiplexer {

// get old routes, or create an empty one, and hold lock on the entry
let mut old_routes = self.routes_by_conn.entry(conn_info.clone()).or_default();
debug!("Old routes from conn {:?}: {:?}", conn_info.id(), *old_routes);
debug!("New routes from conn {:?}: {:?}", conn_info.id(), new_routes);
let routes_to_remove: BTreeSet<RouteEntry> = old_routes.difference(&new_routes).cloned().collect();
let routes_to_add: BTreeSet<RouteEntry> = new_routes.difference(&old_routes).cloned().collect();

Expand Down
121 changes: 73 additions & 48 deletions crates/swbus-core/src/mux/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,52 +11,57 @@ use swbus_config::SwbusConfig;
use swbus_proto::result::*;
use swbus_proto::swbus::swbus_service_server::{SwbusService, SwbusServiceServer};
use swbus_proto::swbus::*;
use tokio::sync::{
mpsc,
oneshot::{self, Receiver, Sender},
};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tokio_stream::Stream;
use tokio_util::sync::CancellationToken;
use tonic::{transport::Server, Request, Response, Status, Streaming};
use tracing::*;

pub struct SwbusServiceHost {
swbus_server_addr: SocketAddr,
swbus_server_addrs: Vec<SocketAddr>,
mux: Option<Arc<SwbusMultiplexer>>,
conn_store: Option<Arc<SwbusConnStore>>,
shutdown_tx: Option<Sender<()>>,
shutdown_rx: Option<Receiver<()>>,
shutdown_ct: CancellationToken,
}

/// Shorthand for a `Result` carrying a tonic `Response<T>` on success and a tonic `Status` on error.
type SwbusMessageResult<T> = Result<Response<T>, Status>;
/// Pinned, boxed, `Send` stream whose items are `Result<SwbusMessage, Status>`.
/// Used as the `StreamMessagesStream` associated type of the `SwbusService` implementation in this file.
type SwbusMessageStream = Pin<Box<dyn Stream<Item = Result<SwbusMessage, Status>> + Send>>;

// Separate implementation struct to allow cloning for multiple servers
/// Per-listener service state: one instance is built for each address the host serves on,
/// and every instance holds `Arc` clones of the same multiplexer and connection store.
struct SwbusServiceImpl {
/// Shared multiplexer; handed to each incoming connection and queried for this server's service path.
mux: Arc<SwbusMultiplexer>,
/// Shared connection store; handed to each incoming connection and told about it via `conn_established`.
conn_store: Arc<SwbusConnStore>,
}

impl SwbusServiceHost {
pub fn new(swbus_server_addr: &SocketAddr) -> Self {
let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
pub fn new(swbus_server_addrs: Vec<SocketAddr>) -> Self {
Self {
swbus_server_addr: *swbus_server_addr,
swbus_server_addrs,
mux: None,
conn_store: None,
shutdown_tx: Some(shutdown_tx),
shutdown_rx: Some(shutdown_rx),
shutdown_ct: CancellationToken::new(),
}
}

pub fn take_shutdown_sender(&mut self) -> Option<Sender<()>> {
self.shutdown_tx.take()
pub fn get_shutdown_token(&self) -> CancellationToken {
self.shutdown_ct.clone()
}

pub async fn shutdown(&mut self) {
if let Some(shutdown_tx) = self.shutdown_tx.take() {
let _ = shutdown_tx.send(());
}
pub async fn shutdown(&self) {
info!("SwbusServiceServer shutting down");
self.shutdown_ct.cancel();
}

pub async fn start(mut self, config: SwbusConfig) -> Result<()> {
debug!("SwbusServiceServer starting at {}", self.swbus_server_addr);
let addr = self.swbus_server_addr;
if self.swbus_server_addrs.is_empty() {
return Err(SwbusError::input(
SwbusErrorCode::InvalidArgs,
"No server addresses provided.".to_string(),
));
}

debug!("SwbusServiceServer starting at {:?}", self.swbus_server_addrs);

if config.routes.is_empty() {
return Err(SwbusError::input(
Expand Down Expand Up @@ -90,29 +95,54 @@ impl SwbusServiceHost {
self.mux = Some(mux);
let conn_store_clone = conn_store.clone();
self.conn_store = Some(conn_store);
let shutdown_rx = self.shutdown_rx.take().unwrap();

Server::builder()
.add_service(SwbusServiceServer::new(self))
.serve_with_shutdown(addr, async {
shutdown_rx.await.ok();
info!("SwbusServiceServer received shutdown signal");
conn_store_clone.shutdown().await;
})
.await
.map_err(|e| {
SwbusError::connection(
SwbusErrorCode::ConnectionError,
io::Error::other(format!("Failed to listen at {addr}: {e}")),
)
})?;
debug!("SwbusServiceServer terminated");

// Start multiple servers, one for each address
let mut server_handles = Vec::new();
let addrs = self.swbus_server_addrs.clone();

for addr in addrs.into_iter() {
let service = SwbusServiceServer::new(SwbusServiceImpl {
mux: self.mux.clone().unwrap(),
conn_store: self.conn_store.clone().unwrap(),
});

let shutdown_ct_for_server = self.shutdown_ct.clone();
let conn_store_clone = conn_store_clone.clone();
let server_handle = tokio::spawn(async move {
info!("Starting SwbusServiceServer on {}", addr);
Server::builder()
.add_service(service)
.serve_with_shutdown(addr, async move {
shutdown_ct_for_server.cancelled().await;
info!("SwbusServiceServer on {} shutting down", addr);
conn_store_clone.shutdown().await;
})
.await
.map_err(|e| {
SwbusError::connection(
SwbusErrorCode::ConnectionError,
io::Error::other(format!("Failed to listen at {addr}: {e}")),
)
})
});

server_handles.push(server_handle);
}

// Wait for all servers to complete
for handle in server_handles {
handle
.await
.map_err(|e| SwbusError::internal(SwbusErrorCode::Fail, format!("Server task panicked: {e}")))??;
}
debug!("All SwbusServiceServers terminated");

Ok(())
}
}

#[tonic::async_trait]
impl SwbusService for SwbusServiceHost {
impl SwbusService for SwbusServiceImpl {
type StreamMessagesStream = SwbusMessageStream;

#[instrument(name="connection_received", level="info", skip_all, fields(addr=%request.remote_addr().unwrap()))]
Expand Down Expand Up @@ -163,20 +193,15 @@ impl SwbusService for SwbusServiceHost {
let (out_tx, out_rx) = mpsc::channel(16);

let conn_info = Arc::new(SwbusConnInfo::new_server(conn_type, client_addr, service_path));
let conn = SwbusConn::from_incoming_stream(
conn_info,
in_stream,
out_tx,
self.mux.as_ref().unwrap().clone(),
self.conn_store.as_ref().unwrap().clone(),
)
.await;
self.conn_store.as_ref().unwrap().conn_established(conn);
let conn =
SwbusConn::from_incoming_stream(conn_info, in_stream, out_tx, self.mux.clone(), self.conn_store.clone())
.await;
self.conn_store.conn_established(conn);
let out_stream = ReceiverStream::new(out_rx);

// Send server service path in response metadata
let mut response = Response::new(Box::pin(out_stream) as Self::StreamMessagesStream);
let server_service_path = self.mux.as_ref().unwrap().get_my_service_path().to_string();
let server_service_path = self.mux.get_my_service_path().to_string();
response
.metadata_mut()
.insert(SWBUS_SERVER_SERVICE_PATH, server_service_path.parse().unwrap());
Expand Down
Loading