|
| 1 | +use dkn_workflows::{DriaWorkflowsConfig, OllamaConfig}; |
| 2 | +use ollama_workflows::ollama_rs::{ |
| 3 | + generation::{completion::request::GenerationRequest, options::GenerationOptions}, |
| 4 | + Ollama, |
| 5 | +}; |
| 6 | +use sysinfo::{CpuRefreshKind, RefreshKind, System}; |
| 7 | + |
1 | 8 | #[tokio::main]
|
2 | 9 | async fn main() {
|
3 |
| - /////// |
4 |
| - // loop over all Ollama models |
5 |
| - // report TPS to console |
| 10 | + // initialize logger |
| 11 | + env_logger::init(); |
| 12 | + |
| 13 | + let cfg = DriaWorkflowsConfig::new_from_csv("finalend/hermes-3-llama-3.1:8b-q8_0,phi3:14b-medium-4k-instruct-q4_1,phi3:14b-medium-128k-instruct-q4_1,phi3.5:3.8b,phi3.5:3.8b-mini-instruct-fp16,gemma2:9b-instruct-q8_0,gemma2:9b-instruct-fp16,llama3.1:latest,llama3.1:8b-instruct-q8_0,llama3.1:8b-instruct-fp16,llama3.1:70b-instruct-q4_0,llama3.1:70b-instruct-q8_0,llama3.2:1b,llama3.2:3b,qwen2.5:7b-instruct-q5_0,qwen2.5:7b-instruct-fp16,qwen2.5:32b-instruct-fp16,qwen2.5-coder:1.5b,qwen2.5-coder:7b-instruct,llama3.2:3b,qwen2.5-coder:7b-instruct-q8_0,qwen2.5-coder:7b-instruct-fp16,deepseek-coder:6.7b,mixtral:8x7b"); |
| 14 | + let config = OllamaConfig::default(); |
| 15 | + let ollama = Ollama::new(config.host, config.port); |
| 16 | + |
| 17 | + log::info!("Starting..."); |
| 18 | + // ensure that all lists of CPUs and processes are filled |
| 19 | + let mut system = System::new_all(); |
| 20 | + // update all information of the system |
| 21 | + system.refresh_all(); |
| 22 | + |
| 23 | + log::debug!("Getting system information..."); |
| 24 | + let brand = system.cpus()[0].brand().to_string(); |
| 25 | + let os_name = System::name().unwrap_or_else(|| "Unknown".to_string()); |
| 26 | + let os_version = System::long_os_version().unwrap_or_else(|| "Unknown".to_string()); |
| 27 | + let cpu_usage = system.global_cpu_usage(); |
| 28 | + let total_memory = system.total_memory(); |
| 29 | + let used_memory = system.used_memory(); |
| 30 | + |
| 31 | + for (_, model) in cfg.models { |
| 32 | + log::info!("Pulling model: {}", model); |
| 33 | + |
| 34 | + // pull model |
| 35 | + match ollama.pull_model(model.to_string(), false).await { |
| 36 | + Ok(status) => log::info!("Status: {}", status.message), |
| 37 | + Err(err) => { |
| 38 | + log::error!("Failed to pull model {}: {:?}", model, err); |
| 39 | + } |
| 40 | + } |
| 41 | + |
| 42 | + log::debug!("Creating request..."); |
| 43 | + // create dummy request |
| 44 | + let mut generation_request = |
| 45 | + GenerationRequest::new(model.to_string(), "compute 6780 * 1200".to_string()); |
| 46 | + |
| 47 | + if let Ok(num_thread) = std::env::var("OLLAMA_NUM_THREAD") { |
| 48 | + generation_request = generation_request.options( |
| 49 | + GenerationOptions::default().num_thread( |
| 50 | + num_thread |
| 51 | + .parse() |
| 52 | + .expect("num threads should be a positive integer"), |
| 53 | + ), |
| 54 | + ); |
| 55 | + } |
6 | 56 |
|
7 |
| - /////// |
8 |
| - // report machine info here as well, such as OS / CPU / RAM / Ollama version |
| 57 | + // generate response |
| 58 | + match ollama.generate(generation_request).await { |
| 59 | + Ok(response) => { |
| 60 | + log::debug!("Got response for model {}", model); |
| 61 | + // compute TPS |
| 62 | + let tps = (response.eval_count.unwrap_or_default() as f64) |
| 63 | + / (response.eval_duration.unwrap_or(1) as f64) |
| 64 | + * 1_000_000_000f64; |
| 65 | + // report machine info |
| 66 | + log::info!( |
| 67 | + "\n Model: {} \n TPS: {} \n OS: {} {} \n Version: {} \n CPU Usage: % {} \n Total Memory: {} KB \n Used Memory: {} KB ", |
| 68 | + model, |
| 69 | + tps, |
| 70 | + brand, |
| 71 | + os_name, |
| 72 | + os_version, |
| 73 | + cpu_usage, |
| 74 | + total_memory, |
| 75 | + used_memory, |
| 76 | + ); |
| 77 | + } |
| 78 | + Err(e) => { |
| 79 | + log::warn!("Ignoring model {}: Workflow failed with error {}", model, e); |
| 80 | + } |
| 81 | + } |
| 82 | + // refresh CPU usage (https://docs.rs/sysinfo/latest/sysinfo/struct.Cpu.html#method.cpu_usage) |
| 83 | + system = |
| 84 | + System::new_with_specifics(RefreshKind::new().with_cpu(CpuRefreshKind::everything())); |
| 85 | + // wait a bit because CPU usage is based on diff |
| 86 | + std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL); |
| 87 | + // refresh CPUs again to get actual value |
| 88 | + system.refresh_cpu_usage(); |
| 89 | + } |
| 90 | + log::info!("Finished"); |
9 | 91 | }
|
0 commit comments