
Commit f8f4318

task completion reports in the end, some readme rfks & removals
1 parent 0365ded commit f8f4318

File tree

10 files changed: +80 -139 lines changed


.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/ISSUE_TEMPLATE/feature_request.md

Lines changed: 0 additions & 28 deletions
This file was deleted.

README.md

Lines changed: 2 additions & 6 deletions
```diff
@@ -28,15 +28,11 @@
 
 ## About
 
-A **Dria Compute Node** is a unit of computation within the Dria Knowledge Network. It's purpose is to process tasks given by the **Dria Admin Node**. To get started, see [node guide](./docs/NODE_GUIDE.md)!
-
-### Tasks
-
 Compute nodes can technically do any arbitrary task, from computing the square root of a given number to finding LLM outputs from a given prompt, or validating an LLM's output with respect to knowledge available on the web accessed via tools.
 
-- **Ping/Pong**: Dria Admin Node broadcasts **ping** messages at a set interval, it is a required duty of the compute node to respond with a **pong** to these so that they can be included in the list of available nodes for task assignment. These tasks will respect the type of model provided within the pong message, e.g. if a task requires `gpt-4o` and you are running `phi3`, you won't be selected for that task.
+- **Heartbeats**: Every few seconds, a heartbeat ping is published into the network, and every compute node responds with a digitally-signed pong message to indicate that it is alive, along with additional information such as which models it is running & how many tasks it has so far.
 
-- **Workflows**: Each task is given in the form of a workflow, based on [Ollama Workflows](https://github.com/andthattoo/ollama-workflows). In simple terms, each workflow defines the agentic behavior of an LLM, all captured in a single JSON file, and can represent things ranging from simple LLM generations to iterative web searching.
+- **Workflows**: Each task is given in the form of a [workflow](https://github.com/andthattoo/ollama-workflows). Every workflow defines an agentic behavior for the chosen LLM, all captured in a single JSON file, and can represent things ranging from simple LLM generations to iterative web searching & reasoning.
 
 ## Node Running
 
```
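To make the heartbeat flow concrete, here is a minimal sketch of the kind of JSON body a signed pong might carry — these are illustrative types and field names, not the node's actual message format, assuming `serde`/`serde_json` for serialization:

```rust
use serde::Serialize;

#[derive(Serialize)]
struct PongPayload {
    /// identifier of the heartbeat ping being acknowledged (hypothetical field)
    heartbeat_id: String,
    /// models this node serves, checked against a task's model requirement
    models: Vec<String>,
    /// number of tasks handled so far (hypothetical field)
    tasks_so_far: usize,
}

fn main() {
    let pong = PongPayload {
        heartbeat_id: "ping-1234".to_string(),
        models: vec!["phi3:3.8b".to_string()],
        tasks_so_far: 42,
    };
    // the real node signs the payload with its private key before publishing;
    // here we only print the JSON body that would be signed
    println!("{}", serde_json::to_string(&pong).expect("serialization"));
}
```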

compute/src/handlers/workflow.rs

Lines changed: 1 addition & 4 deletions
```diff
@@ -54,10 +54,7 @@ impl WorkflowHandler {
 
         // check task inclusion via the bloom filter
         if !task.filter.contains(&node.config.address)? {
-            log::info!(
-                "Task {} does not include this node within the filter.",
-                task.task_id
-            );
+            log::info!("Task {} ignored due to filter.", task.task_id);
 
             // accept the message, someone else may be included in filter
             return Ok(Either::Left(MessageAcceptance::Accept));
```
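For context, `task.filter` is a bloom filter over node addresses: it can produce false positives but never false negatives, which is why a filtered-out node still accepts the message — some other peer may genuinely be included. A self-contained toy illustration of that membership test (not the filter implementation the node actually uses):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

struct ToyBloomFilter {
    bits: Vec<bool>,
    num_hashes: u64,
}

impl ToyBloomFilter {
    fn new(size: usize, num_hashes: u64) -> Self {
        Self { bits: vec![false; size], num_hashes }
    }

    /// derive a bit index from the item and a hash seed
    fn index(&self, item: &str, seed: u64) -> usize {
        let mut hasher = DefaultHasher::new();
        seed.hash(&mut hasher);
        item.hash(&mut hasher);
        (hasher.finish() as usize) % self.bits.len()
    }

    fn insert(&mut self, item: &str) {
        for seed in 0..self.num_hashes {
            let i = self.index(item, seed);
            self.bits[i] = true;
        }
    }

    /// may return a false positive, but an inserted item is never missed
    fn contains(&self, item: &str) -> bool {
        (0..self.num_hashes).all(|seed| self.bits[self.index(item, seed)])
    }
}

fn main() {
    let mut filter = ToyBloomFilter::new(1024, 3);
    filter.insert("0xa11ce");
    assert!(filter.contains("0xa11ce")); // always true for inserted items
    println!("other address included: {}", filter.contains("0xb0b"));
}
```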

compute/src/main.rs

Lines changed: 4 additions & 39 deletions
```diff
@@ -7,9 +7,6 @@ use tokio_util::{sync::CancellationToken, task::TaskTracker};
 async fn main() -> Result<()> {
     let dotenv_result = dotenvy::dotenv();
 
-    // TODO: remove me later when the launcher is fixed
-    amend_log_levels();
-
     env_logger::builder()
         .format_timestamp(Some(env_logger::TimestampPrecision::Millis))
         .init();
@@ -75,21 +72,23 @@
     // create the node
     let (mut node, p2p, worker_batch, worker_single) = DriaComputeNode::new(config).await?;
 
-    // spawn threads
+    // spawn p2p client first
     log::info!("Spawning peer-to-peer client thread.");
     task_tracker.spawn(async move { p2p.run().await });
 
+    // spawn batch worker thread if we are using such models (e.g. OpenAI, Gemini, OpenRouter)
     if let Some(mut worker_batch) = worker_batch {
         log::info!("Spawning workflows batch worker thread.");
         task_tracker.spawn(async move { worker_batch.run_batch().await });
     }
 
+    // spawn single worker thread if we are using such models (e.g. Ollama)
     if let Some(mut worker_single) = worker_single {
         log::info!("Spawning workflows single worker thread.");
         task_tracker.spawn(async move { worker_single.run().await });
     }
 
-    // launch the node in a separate thread
+    // spawn compute node thread
     log::info!("Spawning compute node thread.");
     let node_token = cancellation.clone();
     task_tracker.spawn(async move {
@@ -165,37 +164,3 @@ async fn wait_for_termination(cancellation: CancellationToken) -> Result<()> {
 
     Ok(())
 }
-
-// #[deprecated]
-/// Very CRUDE fix due to launcher log level bug
-///
-/// TODO: remove me later when the launcher is fixed
-pub fn amend_log_levels() {
-    if let Ok(rust_log) = std::env::var("RUST_LOG") {
-        let log_level = if rust_log.contains("dkn_compute=info") {
-            "info"
-        } else if rust_log.contains("dkn_compute=debug") {
-            "debug"
-        } else if rust_log.contains("dkn_compute=trace") {
-            "trace"
-        } else {
-            return;
-        };
-
-        // check if it contains other log levels
-        let mut new_rust_log = rust_log.clone();
-        if !rust_log.contains("dkn_p2p") {
-            new_rust_log = format!("{},{}={}", new_rust_log, "dkn_p2p", log_level);
-        }
-        if !rust_log.contains("dkn_workflows") {
-            new_rust_log = format!("{},{}={}", new_rust_log, "dkn_workflows", log_level);
-        }
-        std::env::set_var("RUST_LOG", new_rust_log);
-    } else {
-        // TODO: use env_logger default function instead of this
-        std::env::set_var(
-            "RUST_LOG",
-            "none,dkn_compute=info,dkn_p2p=info,dkn_workflows=info",
-        );
-    }
-}
```
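The spawn calls above all go through `tokio_util`'s `TaskTracker`, paired with a `CancellationToken` for shutdown. A minimal runnable sketch of that layout — the worker body is a placeholder, not the node's actual logic:

```rust
use tokio_util::{sync::CancellationToken, task::TaskTracker};

#[tokio::main]
async fn main() {
    let task_tracker = TaskTracker::new();
    let cancellation = CancellationToken::new();

    // each long-running component gets its own tracked task,
    // holding a clone of the cancellation token
    let token = cancellation.clone();
    task_tracker.spawn(async move {
        // a real worker would `select!` on this alongside its actual work
        token.cancelled().await;
        println!("worker shutting down");
    });

    // no further tasks will be spawned; wait() resolves once all complete
    task_tracker.close();

    // in the real node this is triggered by a termination signal
    cancellation.cancel();
    task_tracker.wait().await;
}
```

Cargo dependencies assumed: `tokio` with the `macros`/`rt` features, and `tokio-util`.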

compute/src/node.rs

Lines changed: 26 additions & 4 deletions
```diff
@@ -42,6 +42,10 @@ pub struct DriaComputeNode {
     pending_tasks_single: HashSet<String>,
     // Batch tasks hash-map
     pending_tasks_batch: HashSet<String>,
+    /// Completed single tasks count
+    completed_tasks_single: usize,
+    /// Completed batch tasks count
+    completed_tasks_batch: usize,
 }
 
 impl DriaComputeNode {
@@ -114,6 +118,8 @@ impl DriaComputeNode {
             workflow_single_tx,
             pending_tasks_single: HashSet::new(),
             pending_tasks_batch: HashSet::new(),
+            completed_tasks_single: 0,
+            completed_tasks_batch: 0,
         },
         p2p_client,
         workflows_batch_worker,
@@ -317,8 +323,14 @@ impl DriaComputeNode {
         if let Some(publish_msg) = publish_msg_opt {
             // remove the task from pending tasks based on its batchability
             match publish_msg.batchable {
-                true => self.pending_tasks_batch.remove(&publish_msg.task_id),
-                false => self.pending_tasks_single.remove(&publish_msg.task_id),
+                true => {
+                    self.completed_tasks_batch += 1;
+                    self.pending_tasks_batch.remove(&publish_msg.task_id);
+                },
+                false => {
+                    self.completed_tasks_single += 1;
+                    self.pending_tasks_single.remove(&publish_msg.task_id);
+                }
             };
 
             // publish the message
@@ -357,6 +369,9 @@
         self.unsubscribe(WorkflowHandler::LISTEN_TOPIC).await?;
         self.unsubscribe(WorkflowHandler::RESPONSE_TOPIC).await?;
 
+        // print one final diagnostic as a summary
+        self.handle_diagnostic_refresh().await;
+
         // shutdown channels
         self.shutdown().await?;
 
@@ -385,9 +400,16 @@
             Err(e) => log::error!("Error getting peer counts: {:?}", e),
         }
 
-        // print task counts
+        // print tasks count
         let [single, batch] = self.get_pending_task_count();
-        log::info!("Pending Task Count (single/batch): {} / {}", single, batch);
+        log::info!("Pending Tasks (single/batch): {} / {}", single, batch);
+
+        // completed tasks count
+        log::debug!(
+            "Completed Tasks (single/batch): {} / {}",
+            self.completed_tasks_single,
+            self.completed_tasks_batch
+        );
     }
 
     /// Updates the local list of available nodes by refreshing it.
```
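This is the commit's headline change: a finished task leaves its pending set and bumps the matching completed counter, and the shutdown path fires one last diagnostic so the completion totals are always reported at the end. A reduced sketch of that bookkeeping pattern, with illustrative types rather than the node's own:

```rust
use std::collections::HashSet;

#[derive(Default)]
struct TaskBook {
    pending: HashSet<String>,
    completed: usize,
}

impl TaskBook {
    /// called when a task is assigned
    fn begin(&mut self, task_id: &str) {
        self.pending.insert(task_id.to_string());
    }

    /// called when a task's result is published
    fn finish(&mut self, task_id: &str) {
        if self.pending.remove(task_id) {
            self.completed += 1;
        }
    }

    /// the kind of line a final diagnostic might log
    fn summary(&self) -> String {
        format!("pending: {}, completed: {}", self.pending.len(), self.completed)
    }
}

fn main() {
    let mut book = TaskBook::default();
    book.begin("task-1");
    book.finish("task-1");
    println!("{}", book.summary()); // pending: 0, completed: 1
}
```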

compute/src/workers/workflow.rs

Lines changed: 4 additions & 8 deletions
```diff
@@ -26,6 +26,9 @@ pub struct WorkflowsWorkerOutput {
     pub batchable: bool,
 }
 
+/// Workflows worker is a task executor that can process workflows in parallel / series.
+///
+/// It is expected to be spawned in another thread, with `run_batch` for batch processing and `run` for single processing.
 pub struct WorkflowsWorker {
     workflow_rx: mpsc::Receiver<WorkflowsWorkerInput>,
     publish_tx: mpsc::Sender<WorkflowsWorkerOutput>,
@@ -95,12 +98,6 @@ impl WorkflowsWorker {
             .workflow_rx
             .recv_many(&mut task_buffer, Self::BATCH_SIZE)
             .await;
-        debug_assert!(
-            num_tasks <= Self::BATCH_SIZE,
-            "drain cant be larger than batch size"
-        );
-        // TODO: just to be sure, can be removed later
-        debug_assert_eq!(num_tasks, task_buffer.len());
 
         if num_tasks == 0 {
             return self.shutdown();
@@ -186,15 +183,14 @@ impl WorkflowsWorker {
            }
            _ => {
                unreachable!(
-                    "drain cant be larger than batch size ({} > {})",
+                    "number of tasks cant be larger than batch size ({} > {})",
                    num_tasks,
                    Self::BATCH_SIZE
                );
            }
        };
 
        // publish all results
-        // TODO: make this a part of executor as well
        log::info!("Publishing {} workflow results", results.len());
        for result in results {
            if let Err(e) = self.publish_tx.send(result).await {
```
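The deleted assertions were re-checking part of `recv_many`'s own contract: it appends at most `limit` values to the buffer and returns how many it received, returning 0 only once the channel is closed and drained. A minimal sketch of this batched-receive loop, assuming tokio 1.37+ for `Receiver::recv_many` (`BATCH_SIZE` here is illustrative):

```rust
use tokio::sync::mpsc;

const BATCH_SIZE: usize = 8;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(64);

    tokio::spawn(async move {
        for i in 0..20 {
            tx.send(i).await.expect("receiver alive");
        }
        // dropping tx closes the channel, so recv_many eventually returns 0
    });

    let mut task_buffer = Vec::new();
    loop {
        // pull up to BATCH_SIZE queued items in one call
        let num_tasks = rx.recv_many(&mut task_buffer, BATCH_SIZE).await;
        if num_tasks == 0 {
            break; // channel closed: shut the worker down
        }
        println!("processing a batch of {num_tasks} tasks: {task_buffer:?}");
        task_buffer.clear(); // recv_many appends, so clear between batches
    }
}
```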

p2p/src/client.rs

Lines changed: 8 additions & 0 deletions
```diff
@@ -244,7 +244,15 @@
                 let _ = sender.send((mesh, all));
             }
             DriaP2PCommand::Shutdown { sender } => {
+                // close the command channel
                 self.cmd_rx.close();
+
+                // remove own peerId from Kademlia DHT
+                let peer_id = self.swarm.local_peer_id().clone();
+                self.swarm.behaviour_mut().kademlia.remove_peer(&peer_id);
+
+                // remove own peerId from Autonat server list
+                self.swarm.behaviour_mut().autonat.remove_server(&peer_id);
                 let _ = sender.send(());
             }
         }
```
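The shutdown branch follows the client's command-channel pattern: the caller sends `Shutdown` carrying a oneshot sender and awaits the acknowledgement, while the client closes its command queue, deregisters itself, and replies. A minimal runnable sketch of that handshake with tokio channels (names are illustrative, not the `DriaP2PClient` API):

```rust
use tokio::sync::{mpsc, oneshot};

enum Command {
    Shutdown { sender: oneshot::Sender<()> },
}

async fn run_client(mut cmd_rx: mpsc::Receiver<Command>) {
    while let Some(cmd) = cmd_rx.recv().await {
        match cmd {
            Command::Shutdown { sender } => {
                // closing the receiver drains queued commands, then ends the loop
                cmd_rx.close();
                // ...cleanup would go here, e.g. removing ourselves from the DHT...
                let _ = sender.send(()); // acknowledge the shutdown
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (cmd_tx, cmd_rx) = mpsc::channel(8);
    let handle = tokio::spawn(run_client(cmd_rx));

    let (tx, rx) = oneshot::channel();
    cmd_tx
        .send(Command::Shutdown { sender: tx })
        .await
        .expect("client alive");
    rx.await.expect("acknowledgement"); // wait for the client to confirm
    handle.await.expect("client task");
}
```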

workflows/README.md

Lines changed: 5 additions & 5 deletions
```diff
@@ -20,13 +20,13 @@ Note that the underlying [Ollama Workflows](https://github.com/andthattoo/ollama-workflows)
 
 ## Usage
 
-DKN Workflows make use of several environment variables, respecting the providers.
+DKN Workflows make use of several environment variables, with respect to several model providers.
 
-- `OLLAMA_HOST` is used to connect to Ollama server
-- `OLLAMA_PORT` is used to connect to Ollama server
+- `OLLAMA_HOST` is used to connect to **Ollama** server
+- `OLLAMA_PORT` is used to connect to **Ollama** server
 - `OLLAMA_AUTO_PULL` indicates whether we should pull missing models automatically or not
-- `OPENAI_API_KEY` is used for OpenAI requests
-- `GEMINI_API_KEY` is used for Gemini requests
+- `OPENAI_API_KEY` is used for **OpenAI** requests
+- `GEMINI_API_KEY` is used for **Gemini** requests
 - `SERPER_API_KEY` is optional API key to use **Serper**, for better Workflow executions
 - `JINA_API_KEY` is optional API key to use **Jina**, for better Workflow executions
 
```
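As a rough illustration of how these variables might be consumed, a minimal sketch assuming `dotenvy` for `.env` loading — the defaults and handling here are assumptions (e.g. Ollama's standard port 11434), not `dkn_workflows` internals:

```rust
fn main() {
    // load .env if present; values may also come straight from the environment
    let _ = dotenvy::dotenv();

    let ollama_host =
        std::env::var("OLLAMA_HOST").unwrap_or_else(|_| "http://127.0.0.1".to_string());
    let ollama_port = std::env::var("OLLAMA_PORT").unwrap_or_else(|_| "11434".to_string());
    let auto_pull = std::env::var("OLLAMA_AUTO_PULL")
        .map(|v| v == "true")
        .unwrap_or(false);

    // provider keys are only needed if a model from that provider is selected
    let openai_key = std::env::var("OPENAI_API_KEY").ok();
    let gemini_key = std::env::var("GEMINI_API_KEY").ok();

    println!("Ollama at {ollama_host}:{ollama_port} (auto-pull: {auto_pull})");
    println!(
        "OpenAI configured: {}, Gemini configured: {}",
        openai_key.is_some(),
        gemini_key.is_some()
    );
}
```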

workflows/tests/models_test.rs

Lines changed: 30 additions & 11 deletions
```diff
@@ -1,14 +1,21 @@
 use dkn_workflows::{DriaWorkflowsConfig, Model, ModelProvider};
 use eyre::Result;
-use std::env;
 
-const LOG_LEVEL: &str = "none,dkn_workflows=debug";
+fn setup() {
+    // read api key from .env
+    let _ = dotenvy::dotenv();
+
+    // set logger
+    let _ = env_logger::builder()
+        .parse_filters("none,dkn_workflows=debug")
+        .is_test(true)
+        .try_init();
+}
 
 #[tokio::test]
 #[ignore = "requires Ollama"]
 async fn test_ollama_check() -> Result<()> {
-    env::set_var("RUST_LOG", LOG_LEVEL);
-    let _ = env_logger::builder().is_test(true).try_init();
+    setup();
 
     let models = vec![Model::Phi3_5Mini];
     let mut model_config = DriaWorkflowsConfig::new(models);
@@ -25,9 +32,7 @@ async fn test_ollama_check() -> Result<()> {
 #[tokio::test]
 #[ignore = "requires OpenAI"]
 async fn test_openai_check() -> Result<()> {
-    let _ = dotenvy::dotenv(); // read api key
-    env::set_var("RUST_LOG", LOG_LEVEL);
-    let _ = env_logger::builder().is_test(true).try_init();
+    setup();
 
     let models = vec![Model::GPT4Turbo];
     let mut model_config = DriaWorkflowsConfig::new(models);
@@ -41,11 +46,25 @@ async fn test_openai_check() -> Result<()> {
 }
 
 #[tokio::test]
-async fn test_empty() -> Result<()> {
-    let mut model_config = DriaWorkflowsConfig::new(vec![]);
+#[ignore = "requires Gemini"]
+async fn test_gemini_check() -> Result<()> {
+    setup();
 
-    let result = model_config.check_services().await;
-    assert!(result.is_err());
+    let models = vec![Model::Gemini15Flash];
+    let mut model_config = DriaWorkflowsConfig::new(models);
+    model_config.check_services().await?;
 
+    assert_eq!(
+        model_config.models[0],
+        (ModelProvider::Gemini, Model::Gemini15Flash)
+    );
     Ok(())
 }
+
+#[tokio::test]
+async fn test_empty() {
+    assert!(DriaWorkflowsConfig::new(vec![])
+        .check_services()
+        .await
+        .is_err());
+}
```
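Note that the provider checks above are marked `#[ignore]`, so `cargo test` skips them by default and they run on demand with `cargo test -- --ignored`; only `test_empty` stays in the regular suite. The shared `setup()` helper keeps `.env` loading and logger initialization in one place, and `try_init()` with a discarded result tolerates being called once per test in the same process.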
