firstbatchxyz
diff --git a/‎Cargo.lock‎
Lines changed: 12 additions & 3 deletions b/‎Cargo.lock‎
Lines changed: 12 additions & 3 deletions
diff --git a/‎compute/Cargo.toml‎
Lines changed: 2 additions & 1 deletion b/‎compute/Cargo.toml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎compute/src/config.rs‎
Lines changed: 0 additions & 16 deletions b/‎compute/src/config.rs‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎compute/src/main.rs‎
Lines changed: 1 addition & 5 deletions b/‎compute/src/main.rs‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎compute/src/node/core.rs‎
Lines changed: 42 additions & 36 deletions b/‎compute/src/node/core.rs‎
Lines changed: 42 additions & 36 deletions
diff --git a/‎compute/src/node/diagnostic.rs‎
Lines changed: 8 additions & 14 deletions b/‎compute/src/node/diagnostic.rs‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎compute/src/node/mod.rs‎
Lines changed: 16 additions & 14 deletions b/‎compute/src/node/mod.rs‎
Lines changed: 16 additions & 14 deletions
@@ -27,7 +27,7 @@ dotenvy.workspace = true
 base64 = "0.22.0"
 hex = "0.4.3"
 hex-literal = "0.4.1"
-uuid = { version = "1.8.0", features = ["v4"] }
+uuid = { version = "1.8.0", features = ["v4", "serde"] }
 rand.workspace = true
 
 # logging & errors
@@ -59,6 +59,7 @@ public-ip-address = "0.3.2"
 dkn-p2p = { path = "../p2p" }
 dkn-utils = { path = "../utils" }
 dkn-workflows = { path = "../workflows" }
+chrono = { version = "0.4.40", features = ["serde"] }
 
 
 # vendor OpenSSL so that it's easier to build cross-platform packages
 
@@ -162,19 +162,3 @@ impl DriaComputeNodeConfig {
         Ok(())
     }
 }
-
-#[cfg(test)]
-impl Default for DriaComputeNodeConfig {
-    /// Creates a new config with dummy values.
-    ///
-    /// Should only be used for testing purposes.
-    fn default() -> Self {
-        env::set_var(
-            "DKN_WALLET_SECRET_KEY",
-            "6e6f64656e6f64656e6f64656e6f64656e6f64656e6f64656e6f64656e6f6465",
-        );
-        env::set_var("DKN_MODELS", "gpt-3.5-turbo");
-
-        Self::new(Default::default())
-    }
-}
@@ -131,11 +131,7 @@ async fn main() -> Result<()> {
     log::info!("Spawning compute node thread.");
     let node_token = cancellation.clone();
     task_tracker.spawn(async move {
-        if let Err(err) = node.run(node_token).await {
-            log::error!("Error within main node loop: {}", err);
-            log::error!("Shutting down node.");
-            node.shutdown().await.expect("could not shutdown node");
-        };
+        node.run(node_token).await;
         log::info!("Closing node.")
     });
 
 
@@ -1,4 +1,4 @@
-use eyre::{eyre, Result};
+use eyre::Result;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 
@@ -7,54 +7,48 @@ use crate::{utils::DriaMessage, DriaComputeNode};
 impl DriaComputeNode {
     /// Runs the main loop of the compute node.
     /// This method is not expected to return until cancellation occurs for the given token.
-    pub async fn run(&mut self, cancellation: CancellationToken) -> Result<()> {
+    pub async fn run(&mut self, cancellation: CancellationToken) {
         /// Number of seconds between refreshing for diagnostic prints.
         const DIAGNOSTIC_REFRESH_INTERVAL_SECS: u64 = 30;
         /// Number of seconds between refreshing the available nodes.
         const AVAILABLE_NODES_REFRESH_INTERVAL_SECS: u64 = 10 * 60;
+        /// Number of seconds between each heartbeat sent to the RPC.
+        const HEARTBEAT_INTERVAL_SECS: u64 = 60;
 
         // prepare durations for sleeps
         let mut diagnostic_refresh_interval =
             tokio::time::interval(Duration::from_secs(DIAGNOSTIC_REFRESH_INTERVAL_SECS));
-        diagnostic_refresh_interval.tick().await; // move one tick
         let mut available_node_refresh_interval =
             tokio::time::interval(Duration::from_secs(AVAILABLE_NODES_REFRESH_INTERVAL_SECS));
-        available_node_refresh_interval.tick().await; // move one tick
+        let mut heartbeat_interval =
+            tokio::time::interval(Duration::from_secs(HEARTBEAT_INTERVAL_SECS));
+
+        // move each one tick
+        available_node_refresh_interval.tick().await;
+        diagnostic_refresh_interval.tick().await;
+        heartbeat_interval.tick().await;
 
         loop {
             tokio::select! {
                 // a task is completed by the worker & should be responded to the requesting peer
                 task_response_msg_opt = self.task_output_rx.recv() => {
-                    let task_response_msg = task_response_msg_opt.ok_or(
-                      eyre!("Publish channel closed unexpectedly, we still have {} batch and {} single tasks.", self.pending_tasks_batch.len(), self.pending_tasks_single.len())
-                    )?; {
-                        if let Err(e) = self.handle_task_response(task_response_msg).await {
-                          log::error!("Error responding to task: {:?}", e);
-                      }
+                    if let Some(task_response_msg) = task_response_msg_opt {
+                        if let Err(e) = self.send_task_output(task_response_msg).await {
+                            log::error!("Error responding to task: {:?}", e);
+                        }
+                    } else {
+                        log::error!("task_output_rx channel closed unexpectedly, we still have {} batch and {} single tasks.", self.pending_tasks_batch.len(), self.pending_tasks_single.len());
+                        break;
                     }
                 },
 
-                // a GossipSub message is received from the channel
-                // // this is expected to be sent by the p2p client
-                // gossipsub_msg_opt = self.gossip_message_rx.recv() => {
-                //     let (propagation_peer_id, message_id, message) = gossipsub_msg_opt.ok_or(eyre!("message_rx channel closed unexpectedly"))?;
-
-                //     // handle the message, returning a message acceptance for the received one
-                //     let acceptance = self.handle_message((propagation_peer_id, &message_id, message)).await;
-
-                //     // validate the message based on the acceptance
-                //     // cant do anything but log if this gives an error as well
-                //     if let Err(e) = self.p2p.validate_message(&message_id, &propagation_peer_id, acceptance).await {
-                //         log::error!("Error validating message {}: {:?}", message_id, e);
-                //     }
-
-                // },
-
-                // a Request is received from the channel, sent by p2p client
-                request_msg_opt = self.request_rx.recv() => {
-                  let request = request_msg_opt.ok_or(eyre!("request_rx channel closed unexpectedly"))?;
-                  if let Err(e) = self.handle_request(request).await {
-                      log::error!("Error handling request: {:?}", e);
+                // a Request or Response is received by the p2p client
+                reqres_msg_opt = self.reqres_rx.recv() => {
+                  if let Some((peer_id, message)) = reqres_msg_opt {
+                    self.handle_reqres(peer_id, message).await;
+                  } else {
+                    log::error!("reqres_rx channel closed unexpectedly.");
+                    break;
                   }
                 },
 
@@ -64,19 +58,28 @@ impl DriaComputeNode {
                 // available nodes are refreshed every now and then
                 _ = available_node_refresh_interval.tick() => self.handle_available_nodes_refresh().await,
 
+                _ = heartbeat_interval.tick() => {
+                  if let Err(e) = self.send_heartbeat().await {
+                    log::error!("Error making heartbeat: {:?}", e);
+                }
+              },
+
                 // check if the cancellation token is cancelled
                 // this is expected to be cancelled by the main thread with signal handling
-                _ = cancellation.cancelled() => break,
+                _ = cancellation.cancelled() => {
+                    log::info!("Cancellation received, shutting down the node.");
+                    break;
+                },
             }
         }
 
         // print one final diagnostic as a summary
         self.handle_diagnostic_refresh().await;
 
         // shutdown channels
-        self.shutdown().await?;
-
-        Ok(())
+        if let Err(e) = self.shutdown().await {
+            log::error!("Could not shutdown the node gracefully: {:?}", e);
+        }
     }
 
     /// Shorthand method to create a signed message with the given data and topic.
@@ -95,9 +98,12 @@ impl DriaComputeNode {
         log::debug!("Sending shutdown command to p2p client.");
         self.p2p.shutdown().await?;
 
-        log::debug!("Closing task response channel.");
+        log::debug!("Closing task output channel.");
         self.task_output_rx.close();
 
+        log::debug!("Closing reqres channel.");
+        self.reqres_rx.close();
+
         Ok(())
     }
 }
@@ -1,12 +1,11 @@
 use colored::Colorize;
 use dkn_p2p::libp2p::multiaddr::Protocol;
 use std::time::Duration;
-use tokio::time::Instant;
 
-use crate::{refresh_dria_nodes, utils::get_steps, DriaComputeNode, DRIA_COMPUTE_NODE_VERSION};
+use crate::{refresh_dria_nodes, utils::get_points, DriaComputeNode, DRIA_COMPUTE_NODE_VERSION};
 
-/// Number of seconds such that if the last ping is older than this, the node is considered unreachable.
-const PING_LIVENESS_SECS: u64 = 150;
+/// Number of seconds such that if the last heartbeat ACK is older than this, the node is considered unreachable.
+const HEARTBEAT_LIVENESS_SECS: Duration = Duration::from_secs(150);
 
 impl DriaComputeNode {
     /// Returns the task count within the channels, `single` and `batch`.
@@ -23,7 +22,7 @@ impl DriaComputeNode {
         let mut diagnostics = vec![format!("Diagnostics (v{}):", DRIA_COMPUTE_NODE_VERSION)];
 
         // print steps
-        if let Ok(steps) = get_steps(&self.config.address).await {
+        if let Ok(steps) = get_points(&self.config.address).await {
             let earned = steps.score - self.initial_steps;
             diagnostics.push(format!(
                 "$DRIA Points: {} total, {} earned in this run, within top {}%",
@@ -55,10 +54,10 @@ impl DriaComputeNode {
                 .join(", ")
         ));
 
-        // add network status as well
         // if we have not received a heartbeat ACK for a while, we are considered offline
-        let is_offline = Instant::now().duration_since(self.last_heartbeat_at)
-            > Duration::from_secs(PING_LIVENESS_SECS);
+        let is_offline = chrono::Utc::now() > self.last_heartbeat_at + HEARTBEAT_LIVENESS_SECS;
+
+        // if we have not yet received a heartbeat response, we are still connecting
         if self.num_heartbeats == 0 {
             // if we didn't have any heartbeats yet, we might still be connecting
             diagnostics.push(format!("Node Status: {}", "CONNECTING".yellow()));
@@ -73,18 +72,13 @@ impl DriaComputeNode {
             ));
         }
 
-        // add pings per second
-        let elapsed = Instant::now().duration_since(self.started_at).as_secs_f64();
-        let pings_per_second = self.num_heartbeats as f64 / elapsed; // elapsed is always > 0
-        diagnostics.push(format!("Pings/sec: {:.3}", pings_per_second));
-
         log::info!("{}", diagnostics.join("\n  "));
 
         // if offline, print this error message as well
         if is_offline {
             log::error!(
                 "Node has not received any pings for at least {} seconds & it may be unreachable!\nPlease restart your node!",
-                PING_LIVENESS_SECS
+                HEARTBEAT_LIVENESS_SECS.as_secs()
             );
         }
 
 
@@ -1,14 +1,14 @@
 use dkn_p2p::{
-    libp2p::{request_response::ResponseChannel, PeerId},
+    libp2p::{request_response, PeerId},
     DriaNodes, DriaP2PClient, DriaP2PCommander, DriaP2PProtocol,
 };
 use eyre::Result;
 use std::collections::HashMap;
-use tokio::{sync::mpsc, time::Instant};
+use tokio::sync::mpsc;
 
 use crate::{
     config::*,
-    utils::{crypto::secret_to_keypair, get_steps, refresh_dria_nodes, SpecCollector},
+    utils::{crypto::secret_to_keypair, get_points, refresh_dria_nodes, SpecCollector},
     workers::task::{TaskWorker, TaskWorkerInput, TaskWorkerMetadata, TaskWorkerOutput},
 };
 
@@ -27,13 +27,14 @@ pub struct DriaComputeNode {
     pub p2p: DriaP2PCommander,
     /// The last time the node had an acknowledged heartbeat.
     /// If this is too much, we can say that the node is not reachable by RPC.
-    pub(crate) last_heartbeat_at: Instant,
+    pub(crate) last_heartbeat_at: chrono::DateTime<chrono::Utc>,
     /// Number of heartbeat responses received.
     pub(crate) num_heartbeats: u64,
-    /// The time the node was started.
-    pub(crate) started_at: Instant,
-    /// Request-response request receiver.
-    request_rx: mpsc::Receiver<(PeerId, Vec<u8>, ResponseChannel<Vec<u8>>)>,
+    /// A mapping of heartbeat UUIDs to their deadlines.
+    /// This is used to track the heartbeats, and their acknowledgements.
+    pub(crate) heartbeats: HashMap<uuid::Uuid, chrono::DateTime<chrono::Utc>>,
+    /// Request-response message receiver, can have both a request or a response.
+    reqres_rx: mpsc::Receiver<(PeerId, request_response::Message<Vec<u8>, Vec<u8>>)>,
     /// Task response receiver, will respond to the request-response channel with the given result.
     task_output_rx: mpsc::Receiver<TaskWorkerOutput>,
     /// Task worker transmitter to send batchable tasks.
@@ -109,7 +110,7 @@ impl DriaComputeNode {
 
         let model_names = config.workflows.get_model_names();
 
-        let initial_steps = get_steps(&config.address)
+        let initial_steps = get_points(&config.address)
             .await
             .map(|s| s.score)
             .unwrap_or_default();
@@ -121,7 +122,7 @@ impl DriaComputeNode {
                 dria_nodes,
                 // receivers
                 task_output_rx: publish_rx,
-                request_rx,
+                reqres_rx: request_rx,
                 // transmitters
                 task_request_batch_tx: task_batch_tx,
                 task_request_single_tx: task_single_tx,
@@ -130,12 +131,13 @@ impl DriaComputeNode {
                 pending_tasks_batch: HashMap::new(),
                 completed_tasks_single: 0,
                 completed_tasks_batch: 0,
-                // others
+                // heartbeats
+                heartbeats: HashMap::new(),
+                last_heartbeat_at: chrono::Utc::now(),
+                num_heartbeats: 0,
+                // misc
                 initial_steps,
                 spec_collector: SpecCollector::new(model_names),
-                last_heartbeat_at: Instant::now(),
-                num_heartbeats: 0,
-                started_at: Instant::now(),
             },
             p2p_client,
             task_batch_worker,