rm profiling feature

erhant · erhant · commit 84895b829c5e · 2024-10-07T21:25:02.000+03:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,8 +1,15 @@
 [workspace]
 resolver = "2"
 members = ["compute", "p2p", "workflows"]
+# compute node is the default member, until Oracle comes in
+# then, a Launcher will be the default member
 default-members = ["compute"]
 
+# profiling build for flamegraphs
+[profile.profiling]
+inherits = "release"
+debug = true
+
 [workspace.package]
 edition = "2021"
 license = "Apache-2.0"
diff --git a/Makefile b/Makefile
@@ -27,11 +27,11 @@ build:
 
 .PHONY: profile-cpu #  | Profile CPU usage with flamegraph
 profile-cpu:
-	  cargo flamegraph --root --profile=profiling --features=profiling
+	  DKN_EXIT_TIMEOUT=120 cargo flamegraph --root --profile=profiling
 
 .PHONY: profile-mem #  | Profile memory usage with instruments
 profile-mem:
-	  cargo instruments --profile=profiling --features=profiling -t Allocations
+	  DKN_EXIT_TIMEOUT=120 cargo instruments --profile=profiling -t Allocations
 
 ###############################################################################
 .PHONY: test #         | Run tests
@@ -42,6 +42,7 @@ test:
 .PHONY: lint #         | Run linter (clippy)
 lint:
 		cargo clippy
+		cargo clippy
 
 .PHONY: format #       | Run formatter (cargo fmt)
 format:
diff --git a/compute/.env.example b/compute/.env.example
@@ -0,0 +1,37 @@
+## DRIA (required) ##
+# Secret key of your compute node, 32 bytes in hexadecimal.
+# e.g.: DKN_WALLET_SECRET_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
+DKN_WALLET_SECRET_KEY=
+# Public key of Dria Admin node, 33-byte (compressed) in hexadecimal.
+# You don't need to change this, simply copy and paste it.
+DKN_ADMIN_PUBLIC_KEY=0208ef5e65a9c656a6f92fb2c770d5d5e2ecffe02a6aade19207f75110be6ae658
+# model1,model2,model3,... (comma separated, case-insensitive)
+# example: phi3:3.8b,gpt-4o-mini
+DKN_MODELS=
+
+## DRIA (optional) ##
+# P2P address, you don't need to change this unless this port is already in use.
+DKN_P2P_LISTEN_ADDR=/ip4/0.0.0.0/tcp/4001
+# Comma-separated static relay nodes
+DKN_RELAY_NODES=
+# Comma-separated static bootstrap nodes
+DKN_BOOTSTRAP_NODES=
+
+# PROFILING ONLY: number of seconds to run before exiting (a value that is not a valid number falls back to 120)
+# DKN_EXIT_TIMEOUT=
+
+## OpenAI (if used, required) ##
+OPENAI_API_KEY=
+
+## Ollama (if used, optional) ##
+# do not change this, it is used by Docker
+OLLAMA_HOST=http://host.docker.internal
+# you can change the port if you would like
+OLLAMA_PORT=11434
+# if "true", automatically pull models from Ollama
+# if "false", you have to download manually
+OLLAMA_AUTO_PULL=true
+
+## Additional Services (optional) ##
+SERPER_API_KEY=
+JINA_API_KEY=
diff --git a/compute/Cargo.toml b/compute/Cargo.toml
@@ -6,15 +6,6 @@ license.workspace = true
 readme = "README.md"
 authors = ["Erhan Tezcan <erhan@firstbatch.xyz>"]
 
-# profiling build for flamegraphs
-[profile.profiling]
-inherits = "release"
-debug = true
-
-[features]
-# used by flamegraphs & instruments
-profiling = []
-
 [dependencies]
 tokio-util = { version = "0.7.10", features = ["rt"] }
 tokio = { version = "1", features = ["macros", "rt-multi-thread", "signal"] }
diff --git a/compute/src/config.rs b/compute/src/config.rs
@@ -79,14 +79,14 @@ impl DriaComputeNodeConfig {
         let address = to_address(&public_key);
         log::info!("Node Address:     0x{}", hex::encode(address));
 
-        let model_config =
+        let workflows =
             DriaWorkflowsConfig::new_from_csv(&env::var("DKN_MODELS").unwrap_or_default());
         #[cfg(not(test))]
-        if model_config.models.is_empty() {
+        if workflows.models.is_empty() {
             log::error!("No models were provided, make sure to restart with at least one model provided within DKN_MODELS.");
             panic!("No models provided.");
         }
-        log::info!("Models: {:?}", model_config.models);
+        log::info!("Models: {:?}", workflows.models);
 
         let p2p_listen_addr_str = env::var("DKN_P2P_LISTEN_ADDR")
             .map(|addr| addr.trim_matches('"').to_string())
@@ -99,12 +99,13 @@ impl DriaComputeNodeConfig {
             secret_key,
             public_key,
             address,
-            workflows: model_config,
+            workflows,
             p2p_listen_addr,
         }
     }
 
     /// Asserts that the configured listen address is free.
+    /// Returns an error if the address is already in use.
     pub fn assert_address_not_in_use(&self) -> Result<()> {
         if address_in_use(&self.p2p_listen_addr) {
             return Err(eyre!(
diff --git a/compute/src/main.rs b/compute/src/main.rs
@@ -1,16 +1,18 @@
+use std::env;
+
 use dkn_compute::*;
-use eyre::Result;
+use eyre::{Context, Result};
 use tokio_util::sync::CancellationToken;
 
 #[tokio::main]
 async fn main() -> Result<()> {
-    if let Err(e) = dotenvy::dotenv() {
-        log::warn!("Could not load .env file: {}", e);
-    }
-
+    let dotenv_result = dotenvy::dotenv();
     env_logger::builder()
         .format_timestamp(Some(env_logger::TimestampPrecision::Millis))
         .init();
+    if let Err(e) = dotenv_result {
+        log::warn!("Could not load .env file: {}", e);
+    }
 
     log::info!(
         r#"
@@ -26,49 +28,45 @@ async fn main() -> Result<()> {
 
     let token = CancellationToken::new();
     let cancellation_token = token.clone();
-    // add cancellation check
     tokio::spawn(async move {
-        // FIXME: weird feature-gating here bugs with IDE, fix this later
-        #[cfg(feature = "profiling")]
-        {
-            const PROFILE_DURATION_SECS: u64 = 120;
-            tokio::time::sleep(tokio::time::Duration::from_secs(PROFILE_DURATION_SECS)).await;
+        if let Some(timeout_str) = env::var("DKN_EXIT_TIMEOUT").ok() {
+            // profiling mode: sleep for the given number of seconds (120 if unparsable), then cancel
+            let duration_secs = timeout_str.parse().unwrap_or(120);
+            tokio::time::sleep(tokio::time::Duration::from_secs(duration_secs)).await;
             cancellation_token.cancel();
+        } else {
+            if let Err(err) = wait_for_termination(cancellation_token.clone()).await {
+                log::error!("Error waiting for termination: {:?}", err);
+                log::error!("Cancelling due to unexpected error.");
+                cancellation_token.cancel();
+            };
         }
-
-        #[cfg(not(feature = "profiling"))]
-        if let Err(err) = wait_for_termination(cancellation_token.clone()).await {
-            log::error!("Error waiting for termination: {:?}", err);
-            log::error!("Cancelling due to unexpected error.");
-            cancellation_token.cancel();
-        };
     });
 
-    // create configurations & check required services
-    let config = DriaComputeNodeConfig::new();
+    // create configurations & check required services & address in use
+    let mut config = DriaComputeNodeConfig::new();
     config.assert_address_not_in_use()?;
     let service_check_token = token.clone();
-    let mut config_clone = config.clone();
     let service_check_handle = tokio::spawn(async move {
         tokio::select! {
             _ = service_check_token.cancelled() => {
                 log::info!("Service check cancelled.");
+                config
             }
-            result = config_clone.workflows.check_services() => {
+            result = config.workflows.check_services() => {
                 if let Err(err) = result {
                     log::error!("Error checking services: {:?}", err);
                     panic!("Service check failed.")
                 }
+                config
             }
         }
     });
+    let config = service_check_handle
+        .await
+        .wrap_err("error during service checks")?;
 
-    // wait for service check to complete
-    if let Err(err) = service_check_handle.await {
-        log::error!("Service check handle error: {}", err);
-        panic!("Could not exit service check thread handle.");
-    };
-
+    log::warn!("Using models: {:#?}", config.workflows.models);
     if !token.is_cancelled() {
         // launch the node
         let node_token = token.clone();
@@ -97,11 +95,9 @@ async fn main() -> Result<()> {
     Ok(())
 }
 
-// FIXME: remove this `unused` once we have a better way to handle this
 /// Waits for various termination signals, and cancels the given token when the signal is received.
 ///
 /// Handles Unix and Windows [target families](https://doc.rust-lang.org/reference/conditional-compilation.html#target_family).
-#[allow(unused)]
 async fn wait_for_termination(cancellation: CancellationToken) -> Result<()> {
     #[cfg(unix)]
     {
diff --git a/workflows/src/providers/ollama.rs b/workflows/src/providers/ollama.rs
@@ -82,9 +82,10 @@ impl OllamaConfig {
     /// Check if requested models exist in Ollama, and then tests them using a workflow.
     pub async fn check(&self, external_models: Vec<Model>) -> Result<Vec<Model>> {
         log::info!(
-            "Checking Ollama requirements (auto-pull {}, workflow timeout: {}s)",
+            "Checking Ollama requirements (auto-pull {}, timeout: {}s, min tps: {})",
             if self.auto_pull { "on" } else { "off" },
-            self.timeout.as_secs()
+            self.timeout.as_secs(),
+            self.min_tps
         );
 
         let ollama = Ollama::new(&self.host, self.port);