added RPC refresh logic

erhant · erhant · commit 95f5027bf135 · 2025-03-26T17:43:52.000+03:00
diff --git a/compute/Cargo.toml b/compute/Cargo.toml
@@ -46,7 +46,7 @@ fastbloom-rs = "0.5.9"
 # machine diagnostics
 # system info
 sysinfo = "0.33.1"
-# gpu info TODO: this gives a build error on Windows
+# gpu info FIXME: this gives a build error on Windows
 # wgpu = { version = "23.0.1", features = [
 #     "serde",
 #     "dx12",
diff --git a/compute/src/node/core.rs b/compute/src/node/core.rs
@@ -1,4 +1,5 @@
-use eyre::Result;
+use dkn_p2p::libp2p::{Multiaddr, PeerId};
+use eyre::{eyre, Result};
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 
@@ -90,6 +91,18 @@ impl DriaComputeNode {
         DriaMessage::new(data, topic, self.p2p.protocol(), &self.config.secret_key)
     }
 
+    /// Dial the given peer at the given address, returning an error if the dial does not complete within 10 seconds.
+    pub async fn dial_with_timeout(&mut self, peer_id: PeerId, addr: Multiaddr) -> Result<()> {
+        // the root cause is not yet known, but some nodes get stuck during the dialling step;
+        // this timeout prevents the caller from waiting forever.
+        const DIAL_TIMEOUT: Duration = Duration::from_secs(10);
+
+        match tokio::time::timeout(DIAL_TIMEOUT, self.p2p.dial(peer_id, addr)).await {
+            Err(timeout) => Err(eyre!("Timeout dialling RPC node: {}", timeout)),
+            Ok(result) => result, // this is also a `Result` enum
+        }
+    }
+
     /// Shutdown channels between p2p, worker and yourself.
     ///
     /// Can be inlined as it is called only once from very few places.
diff --git a/compute/src/node/diagnostic.rs b/compute/src/node/diagnostic.rs
@@ -1,7 +1,10 @@
 use colored::Colorize;
 use std::time::Duration;
 
-use crate::{utils::get_points, DriaComputeNode, DRIA_COMPUTE_NODE_VERSION};
+use crate::{
+    utils::{get_points, DriaRPC},
+    DriaComputeNode, DRIA_COMPUTE_NODE_VERSION,
+};
 
 /// Number of seconds such that if the last heartbeat ACK is older than this, the node is considered unreachable.
 const HEARTBEAT_LIVENESS_SECS: Duration = Duration::from_secs(150);
@@ -82,29 +85,47 @@ impl DriaComputeNode {
         }
     }
 
-    /// Updates the local list of available nodes by refreshing it.
-    /// Dials the RPC nodes again for better connectivity.
+    /// Dials the existing RPC node if we are not connected to it.
+    ///
+    /// If there is an error while doing that,
+    /// it will try to get a new RPC node and dial it.
     pub(crate) async fn handle_available_nodes_refresh(&mut self) {
-        log::info!("Refreshing available Dria nodes.");
+        log::debug!("Checking RPC connections for diagnostics.");
 
-        // FIXME: what to do for refreshing nodes
-        // if let Err(e) = refresh_dria_nodes(&mut self.dria_nodes).await {
-        //     log::error!("Error refreshing available nodes: {:?}", e);
-        // };
-
-        // TODO: check if we are connected to the node, and dial again if not
-
-        // dial the RPC
-        log::info!("Dialling RPC at: {}", self.dria_nodes.addr);
-        let fut = self
+        // check if we are connected
+        let is_connected = self
             .p2p
-            .dial(self.dria_nodes.peer_id, self.dria_nodes.addr.clone());
-        match tokio::time::timeout(Duration::from_secs(10), fut).await {
-            Err(timeout) => log::error!("Timeout dialling RPC node: {:?}", timeout),
-            Ok(res) => match res {
-                Err(e) => log::warn!("Error dialling RPC node: {:?}", e),
-                Ok(_) => log::info!("Successfully dialled RPC!"),
-            },
-        };
+            .is_connected(self.dria_rpc.peer_id)
+            .await
+            .unwrap_or(false);
+
+        // if we are not connected, try to dial it again
+        if !is_connected {
+            log::info!("Dialling RPC at: {}", self.dria_rpc.addr);
+            if let Err(err) = self
+                .dial_with_timeout(self.dria_rpc.peer_id, self.dria_rpc.addr.clone())
+                .await
+            {
+                // if we also cannot dial it, get a new RPC node
+                log::warn!(
+                    "Could not dial to RPC at: {}: {err:?}\nWill get a new RPC node.",
+                    self.dria_rpc.addr,
+                );
+                self.dria_rpc = DriaRPC::new(self.dria_rpc.network).await;
+
+                // now dial this new RPC again
+                if let Err(err) = self
+                    .dial_with_timeout(self.dria_rpc.peer_id, self.dria_rpc.addr.clone())
+                    .await
+                {
+                    // worst case: we can't dial this one either, so just leave it for the next diagnostic
+                    log::error!("Could not dial the new RPC: {err:?}\nWill try again in the next diagnostic refresh.");
+                }
+            } else {
+                log::info!("Successfully dialled to RPC at: {}", self.dria_rpc.addr);
+            }
+        } else {
+            log::debug!("Connection with {} is intact.", self.dria_rpc.peer_id);
+        }
     }
 }
diff --git a/compute/src/node/mod.rs b/compute/src/node/mod.rs
@@ -21,8 +21,8 @@ const PUBLISH_CHANNEL_BUFSIZE: usize = 1024;
 
 pub struct DriaComputeNode {
     pub config: DriaComputeNodeConfig,
-    /// Pre-defined nodes that belong to Dria, e.g. bootstraps, relays and RPCs.
-    pub dria_nodes: DriaRPC,
+    /// Chosen RPC node.
+    pub dria_rpc: DriaRPC,
     /// Peer-to-peer client commander to interact with the network.
     pub p2p: DriaP2PCommander,
     /// The last time the node had an acknowledged heartbeat.
@@ -116,7 +116,7 @@ impl DriaComputeNode {
             DriaComputeNode {
                 config,
                 p2p: p2p_commander,
-                dria_nodes,
+                dria_rpc: dria_nodes,
                 // receivers
                 task_output_rx: publish_rx,
                 reqres_rx: request_rx,
diff --git a/compute/src/node/reqres.rs b/compute/src/node/reqres.rs
@@ -30,9 +30,9 @@ impl DriaComputeNode {
                 log::info!("Received a request ({}) from {}", request_id, peer_id);
 
                 // ensure that message is from the known RPCs
-                if self.dria_nodes.peer_id != peer_id {
+                if self.dria_rpc.peer_id != peer_id {
                     log::warn!("Received request from unauthorized source: {}", peer_id);
-                    log::debug!("Allowed source: {}", self.dria_nodes.peer_id);
+                    log::debug!("Allowed source: {}", self.dria_rpc.peer_id);
                 } else if let Err(e) = self.handle_request(peer_id, request, channel).await {
                     log::error!("Error handling request: {:?}", e);
                 }
@@ -194,7 +194,7 @@ impl DriaComputeNode {
     /// Sends a heartbeat request to the configured RPC node.
     #[inline]
     pub(crate) async fn send_heartbeat(&mut self) -> Result<()> {
-        let peer_id = self.dria_nodes.peer_id;
+        let peer_id = self.dria_rpc.peer_id;
         let request_id = HeartbeatRequester::send_heartbeat(self, peer_id).await?;
         log::info!("Sent heartbeat request ({}) to {}", request_id, peer_id);
 
diff --git a/compute/src/reqres/heartbeat.rs b/compute/src/reqres/heartbeat.rs
@@ -32,8 +32,7 @@ pub struct HeartbeatRequest {
 pub struct HeartbeatResponse {
     /// UUID as given in the request.
     pub(crate) heartbeat_id: Uuid,
-    /// An associated error with the response,
-    ///
+    /// An associated error with the response:
     /// - `None` means that the heartbeat was acknowledged.
     /// - `Some` means that the heartbeat was not acknowledged for the given reason.
     pub(crate) error: Option<String>,
@@ -66,10 +65,9 @@ impl HeartbeatRequester {
             .p2p
             .request(
                 peer_id,
-                serde_json::to_vec(&heartbeat_request).expect("TODO: !!!"),
+                serde_json::to_vec(&heartbeat_request).expect("should be serializable"),
             )
-            .await
-            .expect("TODO: !!!");
+            .await?;
 
         // add it to local heartbeats set
         node.heartbeats.insert(uuid, deadline);
diff --git a/compute/src/utils/rpc.rs b/compute/src/utils/rpc.rs
@@ -3,6 +3,7 @@ use dkn_p2p::DriaNetworkType;
 use eyre::Result;
 use std::fmt::Debug;
 
+/// The connected RPC node, as per the Star network topology.
 #[derive(Debug, Clone)]
 pub struct DriaRPC {
     pub addr: Multiaddr,
@@ -13,14 +14,16 @@ pub struct DriaRPC {
 impl DriaRPC {
     /// Creates a new `AvailableNodes` struct for the given network type.
     pub async fn new(network: DriaNetworkType) -> Self {
-        let addr = refresh_rpc_addr(&network).await.expect("TODO: !!!");
+        let addr = refresh_rpc_addr(&network)
+            .await
+            .expect("could not get RPC address");
         let peer_id = addr
             .iter()
             .find_map(|p| match p {
                 Protocol::P2p(peer_id) => Some(peer_id),
                 _ => None,
             })
-            .expect("TODO: !!!");
+            .expect("returned address does not contain a peer id");
 
         Self {
             addr,
@@ -63,22 +66,4 @@ mod tests {
         let node = DriaRPC::new(DriaNetworkType::Community).await;
         println!("{:?}", node);
     }
-
-    #[tokio::test]
-    async fn test_extract_peer_id() {
-        let addr: Multiaddr =
-            "/ip4/98.85.74.179/tcp/4001/p2p/16Uiu2HAmH4YGRWuJSvo5bxdShozKSve1WaZMGzAr3GiNNzadsdaN"
-                .parse()
-                .unwrap();
-        let expected_peer_id: PeerId = "16Uiu2HAmH4YGRWuJSvo5bxdShozKSve1WaZMGzAr3GiNNzadsdaN"
-            .parse()
-            .unwrap();
-
-        let peer_id = addr.iter().find_map(|p| match p {
-            Protocol::P2p(peer_id) => Some(peer_id),
-            _ => None,
-        });
-
-        assert_eq!(Some(expected_peer_id), peer_id);
-    }
 }
diff --git a/compute/src/workers/task.rs b/compute/src/workers/task.rs
@@ -332,7 +332,7 @@ mod tests {
         log::info!("Got all results, closing channel.");
         publish_rx.close();
 
-        // TODO: this bugs out
+        // FIXME: this bugs out
         worker_handle.await.unwrap();
         log::info!("Done.");
     }
diff --git a/p2p/README.md b/p2p/README.md
@@ -25,35 +25,19 @@ Here is an example where we create the said entities:
 ```rs
 use dkn_p2p::{DriaP2PClient, DriaP2PProtocol};
 
-// your wallet, or something random maybe
-let keypair = Keypair::generate_secp256k1();
-
-// your listen address
-let addr = Multiaddr::from_str("/ip4/0.0.0.0/tcp/4001")?;
-
-// static bootstrap & relay & rpc addresses
-let bootstraps = vec![Multiaddr::from_str(
-    "some-multiaddrs-here"
-)?];
-let relays = vec![Multiaddr::from_str(
-    "some-multiaddrs-here"
-)?];
-let rpcs = vec![Multiaddr::from_str(
-    "some-multiaddrs-here"
-)?];
-
-let protocol = "0.2";
+let keypair = Keypair::generate_secp256k1(); // or your wallet
+let listen_addr = Multiaddr::from_str("/ip4/0.0.0.0/tcp/4001")?;
+let rpc_addr = Multiaddr::from_str("some-multiaddr-here")?;
+let protocol = "0.4"; // DKN protocol version
 
 // `new` returns 3 things:
 // - p2p client itself, to be given to a thread
 // - p2p commander, a small client to be able to speak with the p2p in another thread
 // - `msg_rx`, the channel to listen for gossipsub messages
 let (client, mut commander, mut msg_rx) = DriaP2PClient::new(
   keypair,
-  addr,
-  bootstraps,
-  relays,
-  rpc,
+  listen_addr,
+  rpc_addr,
   protocol
 )?;
 ```
diff --git a/p2p/src/behaviour.rs b/p2p/src/behaviour.rs
@@ -11,17 +11,13 @@ pub struct DriaBehaviour {
 }
 
 impl DriaBehaviour {
-    pub fn new(
-        key: &Keypair,
-        identity_protocol: String,
-        reqres_protocol: StreamProtocol,
-    ) -> Result<Self> {
+    pub fn new(key: &Keypair, identity_protocol: String, reqres_protocol: StreamProtocol) -> Self {
         let public_key = key.public();
 
-        Ok(Self {
+        Self {
             identify: create_identify_behaviour(public_key, identity_protocol),
             request_response: create_request_response_behaviour(reqres_protocol),
-        })
+        }
     }
 }
 
diff --git a/p2p/src/client.rs b/p2p/src/client.rs
@@ -39,8 +39,6 @@ const MSG_CHANNEL_BUFSIZE: usize = 1024;
 impl DriaP2PClient {
     /// Creates a new P2P client with the given keypair and listen address.
     ///
-    /// Can provide a list of bootstrap and relay nodes to connect to as well at the start, and RPC addresses to dial preemptively.
-    ///
     /// The `version` is used to create the protocol strings for the client, and its very important that
     /// they match with the clients existing within the network.
     ///
@@ -66,7 +64,6 @@ impl DriaP2PClient {
             )?
             .with_behaviour(|key| {
                 DriaBehaviour::new(key, protocol.identity(), protocol.request_response())
-                    .expect("TODO: !!!")
             })?
             .with_swarm_config(|c| {
                 c.with_idle_connection_timeout(Duration::from_secs(IDLE_CONNECTION_TIMEOUT_SECS))
@@ -137,6 +134,9 @@ impl DriaP2PClient {
                     .build();
                 let _ = sender.send(self.swarm.dial(opts));
             }
+            DriaP2PCommand::IsConnected { peer_id, sender } => {
+                let _ = sender.send(self.swarm.is_connected(&peer_id));
+            }
             DriaP2PCommand::NetworkInfo { sender } => {
                 let _ = sender.send(self.swarm.network_info());
             }
@@ -240,22 +240,17 @@ impl DriaP2PClient {
                 ..
             })) => self.handle_identify_event(peer_id, info),
 
-            // log listen addreses
             SwarmEvent::NewListenAddr { address, .. } => {
                 log::warn!("Local node is listening on {}", address);
             }
-
-            // add external address of peers to Kademlia routing table
             SwarmEvent::NewExternalAddrOfPeer { peer_id, address } => {
                 log::info!(
                     "External address of peer {} confirmed: {}",
                     peer_id,
                     address
                 );
             }
-            // add your own peer_id to kademlia as well
             SwarmEvent::ExternalAddrConfirmed { address } => {
-                // this is usually the external address via relay
                 log::info!("External address confirmed: {}", address);
             }
 
diff --git a/p2p/src/commands.rs b/p2p/src/commands.rs
@@ -10,6 +10,11 @@ pub enum DriaP2PCommand {
     NetworkInfo {
         sender: oneshot::Sender<swarm::NetworkInfo>,
     },
+    /// Check if there is an active connection to the given peer.
+    IsConnected {
+        peer_id: PeerId,
+        sender: oneshot::Sender<bool>,
+    },
     /// Dial a known peer.
     Dial {
         peer_id: PeerId,
@@ -122,6 +127,18 @@ impl DriaP2PCommander {
             .wrap_err("could not dial")
     }
 
+    /// Checks if there is an active connection to the given peer.
+    pub async fn is_connected(&mut self, peer_id: PeerId) -> Result<bool> {
+        let (sender, receiver) = oneshot::channel();
+
+        self.sender
+            .send(DriaP2PCommand::IsConnected { peer_id, sender })
+            .await
+            .wrap_err("could not send")?;
+
+        receiver.await.wrap_err("could not receive")
+    }
+
     /// Sends a shutdown signal to the client.
     pub async fn shutdown(&mut self) -> Result<()> {
         let (sender, receiver) = oneshot::channel();
diff --git a/workflows/src/bin/tps.rs b/workflows/src/bin/tps.rs

Original file line number	Diff line number	Diff line change
`@@ -332,7 +332,7 @@ mod tests {`
`332`	`332`	`log::info!("Got all results, closing channel.");`
`333`	`333`	`publish_rx.close();`
`334`	`334`
`335`		`- // TODO: this bugs out`
	`335`	`+ // FIXME: this bugs out`
`336`	`336`	`worker_handle.await.unwrap();`
`337`	`337`	`log::info!("Done.");`
`338`	`338`	`}`
Original file line number	Diff line number	Diff line change
`@@ -11,17 +11,13 @@ pub struct DriaBehaviour {`
`11`	`11`	`}`
`12`	`12`
`13`	`13`	`impl DriaBehaviour {`
`14`		`- pub fn new(`
`15`		`- key: &Keypair,`
`16`		`- identity_protocol: String,`
`17`		`- reqres_protocol: StreamProtocol,`
`18`		`- ) -> Result<Self> {`
	`14`	`+ pub fn new(key: &Keypair, identity_protocol: String, reqres_protocol: StreamProtocol) -> Self {`
`19`	`15`	`let public_key = key.public();`
`20`	`16`
`21`		`- Ok(Self {`
	`17`	`+ Self {`
`22`	`18`	`identify: create_identify_behaviour(public_key, identity_protocol),`
`23`	`19`	`request_response: create_request_response_behaviour(reqres_protocol),`
`24`		`- })`
	`20`	`+ }`
`25`	`21`	`}`
`26`	`22`	`}`
`27`	`23`