diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..103939e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: CI + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + +jobs: + check: + name: Check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - run: cargo check --all-targets + + test: + name: Test + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - run: cargo test --all-targets + + fmt: + name: Format + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - run: cargo fmt --all -- --check + + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy + - uses: Swatinem/rust-cache@v2 + - run: cargo clippy --all-targets -- -D warnings + + docs: + name: Docs + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - run: cargo doc --no-deps --all-features + env: + RUSTDOCFLAGS: -D warnings diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..21f087b --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Documentation + +on: + push: + branches: [main, develop] + paths: + - 'docs/**' + - 'mkdocs.yml' + workflow_dispatch: + +permissions: + contents: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install 
dependencies + run: pip install mkdocs-material mkdocstrings[python] + + - name: Deploy to gh-pages + run: mkdocs gh-deploy --force diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..75d77ab --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,75 @@ +name: Release + +on: + push: + tags: + - 'v*' + +permissions: + contents: write + +jobs: + build: + name: Build ${{ matrix.target }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - os: ubuntu-latest + target: x86_64-unknown-linux-gnu + artifact: charmer-linux-x86_64 + - os: ubuntu-latest + target: x86_64-unknown-linux-musl + artifact: charmer-linux-x86_64-musl + - os: macos-latest + target: x86_64-apple-darwin + artifact: charmer-macos-x86_64 + - os: macos-latest + target: aarch64-apple-darwin + artifact: charmer-macos-aarch64 + + steps: + - uses: actions/checkout@v4 + + - uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Install musl tools + if: matrix.target == 'x86_64-unknown-linux-musl' + run: sudo apt-get install -y musl-tools + + - uses: Swatinem/rust-cache@v2 + + - name: Build + run: cargo build --release --target ${{ matrix.target }} + + - name: Package + run: | + cd target/${{ matrix.target }}/release + tar czf ../../../${{ matrix.artifact }}.tar.gz charmer + cd ../../.. 
+ + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.artifact }} + path: ${{ matrix.artifact }}.tar.gz + + release: + name: Create Release + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: Create release + uses: softprops/action-gh-release@v1 + with: + files: artifacts/**/*.tar.gz + generate_release_notes: true diff --git a/.gitignore b/.gitignore index ae8f085..ab14eaf 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,31 @@ Cargo.lock .DS_Store *.swp *.swo + +# Test pipeline outputs +tests/pipelines/*/results/ +tests/pipelines/*/.snakemake/ +tests/pipelines/*/dag.png + +# Pixi +.pixi/ +pixi.lock + +# Documentation +site/ + +# VHS output +docs/assets/*.gif +docs/assets/*.webm +docs/assets/*.mp4 + +# IDE +.idea/ +.vscode/ +*.code-workspace + +# Coverage +*.profraw +*.profdata +coverage/ +lcov.info diff --git a/Cargo.toml b/Cargo.toml index 04d11a8..f8f2384 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,20 +4,21 @@ members = ["crates/*"] [workspace.package] version = "0.1.0" -edition = "2024" +edition = "2021" +rust-version = "1.85" license = "MIT" authors = ["Jay Hesselberth"] [workspace.dependencies] # TUI -ratatui = "0.29" -crossterm = "0.28" +ratatui = "0.30.0-beta.0" +crossterm = "0.29" # Async -tokio = { version = "1.35", features = ["full"] } +tokio = { version = "1.43", features = ["full"] } # File watching -notify = "6.1" +notify = "8.2" # Serialization serde = { version = "1.0", features = ["derive"] } @@ -27,24 +28,29 @@ serde_json = "1.0" chrono = { version = "0.4", features = ["serde"] } # CLI -clap = { version = "4.4", features = ["derive"] } +clap = { version = "4.5", features = ["derive"] } # Paths camino = { version = "1.1", features = ["serde1"] } # Regex for parsing -regex = "1.10" +regex = "1.11" # Base64 for snakemake filenames base64 = "0.22" # Error handling -thiserror = 
"1.0" -miette = { version = "7.0", features = ["fancy"] } +thiserror = "2.0" +miette = { version = "7.4", features = ["fancy"] } + +# Clipboard +arboard = "3" # Internal crates charmer-core = { path = "crates/charmer-core" } +charmer-parsers = { path = "crates/charmer-parsers" } charmer-slurm = { path = "crates/charmer-slurm" } +charmer-lsf = { path = "crates/charmer-lsf" } charmer-state = { path = "crates/charmer-state" } charmer-monitor = { path = "crates/charmer-monitor" } charmer-cli = { path = "crates/charmer-cli" } diff --git a/README.md b/README.md new file mode 100644 index 0000000..4e9e320 --- /dev/null +++ b/README.md @@ -0,0 +1,141 @@ +# charmer + +A terminal user interface (TUI) for monitoring Snakemake pipelines running on HPC clusters. + +![Charmer Demo](docs/images/demo.gif) + +## Features + +- **Real-time monitoring** of Snakemake pipelines on SLURM and LSF clusters +- **Unified view** merging data from scheduler queries and Snakemake metadata +- **Interactive TUI** with vim-style navigation +- **Filtering & sorting** by job status, rule name, or time +- **Log viewer** for examining job output +- **Cross-platform** support for Linux and macOS + +## Installation + +### From source + +```bash +# Clone the repository +git clone https://github.com/rnabioco/charmer.git +cd charmer + +# Build with cargo +cargo build --release + +# Install to ~/.cargo/bin +cargo install --path crates/charmer +``` + +### Using pixi (for development) + +```bash +pixi install +pixi run build +``` + +## Usage + +```bash +# Monitor current directory +charmer + +# Monitor specific directory +charmer /path/to/pipeline + +# With options +charmer --poll-interval 10 --theme dark /path/to/pipeline +``` + +### CLI Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--poll-interval` | 5 | Seconds between scheduler queries | +| `--run-uuid` | - | Filter to specific Snakemake run | +| `--theme` | dark | Color theme (dark/light) | +| `--history-hours` | 
24 | Show completed jobs from last N hours | + +### Keyboard Shortcuts + +| Key | Action | +|-----|--------| +| `j` / `↓` | Move down | +| `k` / `↑` | Move up | +| `g` / `Home` | Go to first job | +| `G` / `End` | Go to last job | +| `f` | Cycle filter (All/Running/Failed/Pending/Completed) | +| `s` | Cycle sort (Status/Rule/Time) | +| `l` / `Enter` | View job logs | +| `F` | Toggle follow mode in logs | +| `?` | Show help | +| `q` / `Ctrl+C` | Quit | + +## Supported Schedulers + +### SLURM + +Charmer queries SLURM using: +- `squeue` for active jobs +- `sacct` for job history + +Jobs are correlated with Snakemake using the comment field format: +`rule_{rulename}_wildcards_{wildcards}` + +### LSF + +Charmer queries LSF using: +- `bjobs` for active jobs +- `bhist` for job history + +## How It Works + +Charmer combines data from multiple sources: + +1. **Snakemake metadata** (`.snakemake/metadata/`) - Job inputs, outputs, shell commands +2. **Scheduler queries** - Job status, resource usage, timing +3. **Log files** (`.snakemake/slurm_logs/`) - Job output and errors + +Data is merged using rule names and timing windows to correlate jobs across sources. + +## Development + +```bash +# Install dependencies +pixi install + +# Run tests +cargo test + +# Run with debug logging +RUST_LOG=debug cargo run -- . + +# Build documentation +pixi run docs +``` + +### Project Structure + +``` +charmer/ +├── crates/ +│ ├── charmer/ # Main binary +│ ├── charmer-cli/ # CLI argument parsing +│ ├── charmer-core/ # Snakemake metadata parsing +│ ├── charmer-slurm/ # SLURM integration +│ ├── charmer-lsf/ # LSF integration +│ ├── charmer-state/ # Unified job state +│ └── charmer-monitor/ # TUI components +├── docs/ # Documentation +└── tests/ # Integration tests +``` + +## License + +MIT License - see [LICENSE](LICENSE) for details. + +## Contributing + +Contributions welcome! Please read our [contributing guide](CONTRIBUTING.md) first. 
diff --git a/crates/charmer-core/src/lib.rs b/crates/charmer-core/src/lib.rs index 99305a3..7a8651b 100644 --- a/crates/charmer-core/src/lib.rs +++ b/crates/charmer-core/src/lib.rs @@ -1,7 +1,13 @@ //! Snakemake metadata parsing for charmer. //! -//! This crate handles parsing of `.snakemake/metadata/` files. +//! This crate handles parsing of `.snakemake/metadata/` files +//! and the main snakemake log file. +pub mod main_log; pub mod metadata; -pub use metadata::{SnakemakeJob, SnakemakeMetadata}; +pub use main_log::{find_latest_log, parse_log_file, parse_main_log, SnakemakeLogInfo}; +pub use metadata::{ + decode_metadata_filename, parse_metadata_file, scan_metadata_dir, MetadataError, SnakemakeJob, + SnakemakeMetadata, +}; diff --git a/crates/charmer-core/src/main_log.rs b/crates/charmer-core/src/main_log.rs new file mode 100644 index 0000000..8700bae --- /dev/null +++ b/crates/charmer-core/src/main_log.rs @@ -0,0 +1,352 @@ +//! Parser for main snakemake log file (.snakemake/log/*.snakemake.log). +//! +//! Extracts pipeline-level information like total job count and progress. + +use camino::Utf8Path; +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::io; + +/// Information parsed from the main snakemake log. 
+#[derive(Debug, Clone, Default)] +pub struct SnakemakeLogInfo { + /// Total number of jobs in the pipeline + pub total_jobs: Option, + /// Number of completed jobs + pub completed_jobs: usize, + /// Job counts per rule + pub jobs_by_rule: HashMap, + /// Rules that have no output files (target rules like "all") + /// These rules don't create metadata files and need synthetic job entries + pub target_rules: HashSet, + /// Number of cores being used + pub cores: Option, + /// Host machine name + pub host: Option, + /// Whether the pipeline has finished + pub finished: bool, + /// Whether there were errors + pub has_errors: bool, + /// Error messages found + pub errors: Vec, +} + +impl SnakemakeLogInfo { + /// Get progress as a fraction (0.0 to 1.0). + pub fn progress(&self) -> f64 { + match self.total_jobs { + Some(total) if total > 0 => self.completed_jobs as f64 / total as f64, + _ => 0.0, + } + } + + /// Get progress as a percentage string. + pub fn progress_percent(&self) -> String { + format!("{:.0}%", self.progress() * 100.0) + } +} + +/// Find the most recent snakemake log file in the working directory. +pub fn find_latest_log(working_dir: &Utf8Path) -> Option { + let log_dir = working_dir.join(".snakemake").join("log"); + if !log_dir.exists() { + return None; + } + + let mut latest: Option<(std::time::SystemTime, camino::Utf8PathBuf)> = None; + + if let Ok(entries) = fs::read_dir(&log_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.ends_with(".snakemake.log") { + if let Ok(metadata) = entry.metadata() { + if let Ok(modified) = metadata.modified() { + if let Ok(utf8_path) = camino::Utf8PathBuf::try_from(path) { + if latest.is_none() || modified > latest.as_ref().unwrap().0 { + latest = Some((modified, utf8_path)); + } + } + } + } + } + } + } + } + + latest.map(|(_, path)| path) +} + +/// Parse the main snakemake log file. 
+pub fn parse_main_log(working_dir: &Utf8Path) -> io::Result { + let log_path = find_latest_log(working_dir) + .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "No snakemake log file found"))?; + + parse_log_file(&log_path) +} + +/// Parse a specific snakemake log file. +pub fn parse_log_file(path: &Utf8Path) -> io::Result { + let content = fs::read_to_string(path)?; + Ok(parse_log_content(&content)) +} + +/// Parse snakemake log content. +pub fn parse_log_content(content: &str) -> SnakemakeLogInfo { + let mut info = SnakemakeLogInfo::default(); + let mut in_job_stats = false; + + // State for tracking rule blocks to identify target rules (rules without outputs) + let mut current_rule: Option = None; + let mut current_rule_has_output = false; + // Track all rules we've seen with their output status + let mut rules_with_outputs: HashSet = HashSet::new(); + let mut all_seen_rules: HashSet = HashSet::new(); + + for line in content.lines() { + let line = line.trim(); + + // Detect rule block start: "localrule X:" or "rule X:" + // This helps us identify target rules (rules without output files) + if (line.starts_with("localrule ") || line.starts_with("rule ")) + && line.ends_with(':') + && !line.contains("(Rule:") + { + // Save previous rule's output status + if let Some(ref rule) = current_rule { + all_seen_rules.insert(rule.clone()); + if current_rule_has_output { + rules_with_outputs.insert(rule.clone()); + } + } + + // Extract rule name: "localrule X:" or "rule X:" -> "X" + let rule_part = line + .trim_start_matches("localrule ") + .trim_start_matches("rule "); + let rule_name = rule_part.trim_end_matches(':').to_string(); + current_rule = Some(rule_name); + current_rule_has_output = false; + continue; + } + + // Track if current rule has an output line + if current_rule.is_some() && line.starts_with("output:") { + current_rule_has_output = true; + } + + // End of rule block detection: timestamp line or certain keywords + if current_rule.is_some() + && 
(line.starts_with('[') || line.starts_with("Select jobs") || line.is_empty()) + { + if let Some(ref rule) = current_rule { + all_seen_rules.insert(rule.clone()); + if current_rule_has_output { + rules_with_outputs.insert(rule.clone()); + } + } + current_rule = None; + current_rule_has_output = false; + } + + // Parse host + if line.starts_with("host:") { + info.host = Some(line.trim_start_matches("host:").trim().to_string()); + continue; + } + + // Parse cores + if line.starts_with("Provided cores:") { + if let Some(cores_str) = line.strip_prefix("Provided cores:") { + info.cores = cores_str.trim().parse().ok(); + } + continue; + } + + // Detect job stats section + if line == "Job stats:" { + in_job_stats = true; + continue; + } + + // Parse job stats table + if in_job_stats { + // End of table (empty line or next section) + if line.is_empty() || line.starts_with("Select jobs") { + in_job_stats = false; + continue; + } + + // Skip header line + if line.starts_with("job") || line.starts_with("---") { + continue; + } + + // Parse "rule_name count" lines + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + let rule = parts[0]; + if let Ok(count) = parts[parts.len() - 1].parse::() { + if rule == "total" { + info.total_jobs = Some(count); + } else { + info.jobs_by_rule.insert(rule.to_string(), count); + } + } + } + continue; + } + + // Parse progress: "X of Y steps (Z%) done" + if line.contains(" of ") && line.contains(" steps") && line.contains("done") { + // Extract "X of Y" + if let Some(of_idx) = line.find(" of ") { + let before = &line[..of_idx]; + // Find the last number before " of " + let completed: Option = before + .split_whitespace() + .last() + .and_then(|s| s.parse().ok()); + + if let Some(c) = completed { + info.completed_jobs = c; + } + } + continue; + } + + // Detect completion + if line.contains("steps (100%) done") || line.contains("Nothing to be done") { + info.finished = true; + continue; + } + + // Detect errors + if 
line.starts_with("Error") || line.contains("error:") || line.contains("Exception") { + info.has_errors = true; + if line.len() < 200 { + info.errors.push(line.to_string()); + } + } + + // Specific error patterns + if line.contains("Exiting because a job execution failed") { + info.has_errors = true; + info.errors.push(line.to_string()); + } + } + + // Handle any remaining rule block at end of file + if let Some(ref rule) = current_rule { + all_seen_rules.insert(rule.clone()); + if current_rule_has_output { + rules_with_outputs.insert(rule.clone()); + } + } + + // Target rules are those we've seen in rule blocks that have no outputs + // These are rules like "all" that just aggregate other targets + info.target_rules = all_seen_rules + .difference(&rules_with_outputs) + .cloned() + .collect(); + + info +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_job_stats() { + let content = r#" +Building DAG of jobs... +Job stats: +job count +--------------------- ------- +align_sample 4 +process_sample 4 +total 8 + +Select jobs to execute... 
+"#; + let info = parse_log_content(content); + assert_eq!(info.total_jobs, Some(8)); + assert_eq!(info.jobs_by_rule.get("align_sample"), Some(&4)); + assert_eq!(info.jobs_by_rule.get("process_sample"), Some(&4)); + } + + #[test] + fn test_parse_progress() { + let content = r#" +[Thu Dec 18 12:24:21 2025] +Finished jobid: 22 (Rule: call_variants) +5 of 27 steps (19%) done +"#; + let info = parse_log_content(content); + assert_eq!(info.completed_jobs, 5); + } + + #[test] + fn test_parse_cores() { + let content = "Provided cores: 4\n"; + let info = parse_log_content(content); + assert_eq!(info.cores, Some(4)); + } + + #[test] + fn test_parse_host() { + let content = "host: myserver\n"; + let info = parse_log_content(content); + assert_eq!(info.host, Some("myserver".to_string())); + } + + #[test] + fn test_parse_target_rules() { + // "all" rule has no output - it's a target rule + // "final_merge" rule has output - it's NOT a target rule + let content = r#" +[Thu Dec 18 16:20:31 2025] +localrule final_merge: + input: results/merged/sample1_merged.vcf + output: results/all_variants.vcf + log: logs/final_merge.log + jobid: 2 + reason: Missing output files: results/all_variants.vcf + +[Thu Dec 18 16:20:47 2025] +localrule all: + input: results/final_report.txt + jobid: 0 + reason: Input files updated by another job: results/final_report.txt + resources: tmpdir=/tmp +"#; + let info = parse_log_content(content); + // "all" should be a target rule (no output) + assert!(info.target_rules.contains("all")); + // "final_merge" should NOT be a target rule (has output) + assert!(!info.target_rules.contains("final_merge")); + } + + #[test] + fn test_parse_target_rules_with_regular_rule() { + let content = r#" +[Thu Dec 18 16:17:43 2025] +rule call_variants: + input: results/aligned/sample6.bam + output: results/variants/sample6_chr2.vcf + log: logs/call_variants/sample6_chr2.log + jobid: 35 + wildcards: sample=sample6, chrom=chr2 + +[Thu Dec 18 16:20:47 2025] +localrule all: + 
input: results/final_report.txt + jobid: 0 +"#; + let info = parse_log_content(content); + assert!(info.target_rules.contains("all")); + assert!(!info.target_rules.contains("call_variants")); + } +} diff --git a/crates/charmer-core/src/metadata.rs b/crates/charmer-core/src/metadata.rs index dd3e9b3..fdb91ab 100644 --- a/crates/charmer-core/src/metadata.rs +++ b/crates/charmer-core/src/metadata.rs @@ -1,7 +1,10 @@ //! Snakemake metadata types and parsing. +use base64::prelude::*; +use camino::{Utf8Path, Utf8PathBuf}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use thiserror::Error; /// Snakemake job metadata from .snakemake/metadata/{base64_output_file} #[derive(Debug, Clone, Deserialize, Serialize)] @@ -59,3 +62,119 @@ pub struct SnakemakeJob { /// Metadata from JSON pub metadata: SnakemakeMetadata, } + +#[derive(Error, Debug)] +pub enum MetadataError { + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + #[error("JSON parse error: {0}")] + Json(#[from] serde_json::Error), + #[error("Base64 decode error: {0}")] + Base64(#[from] base64::DecodeError), + #[error("UTF-8 decode error: {0}")] + Utf8(#[from] std::string::FromUtf8Error), + #[error("Metadata directory not found: {0}")] + NotFound(Utf8PathBuf), +} + +/// Decode a base64-encoded filename to get the output path. +pub fn decode_metadata_filename(filename: &str) -> Result { + let bytes = BASE64_STANDARD.decode(filename)?; + let path = String::from_utf8(bytes)?; + Ok(path) +} + +/// Parse a single metadata file. 
+pub fn parse_metadata_file(path: &Utf8Path) -> Result { + let content = std::fs::read_to_string(path)?; + let metadata: SnakemakeMetadata = serde_json::from_str(&content)?; + + // Decode the output path from the filename + let filename = path + .file_name() + .ok_or_else(|| MetadataError::NotFound(path.to_owned()))?; + let output_path = decode_metadata_filename(filename)?; + + Ok(SnakemakeJob { + output_path, + metadata, + }) +} + +/// Scan the .snakemake/metadata directory and parse all metadata files. +pub fn scan_metadata_dir(working_dir: &Utf8Path) -> Result, MetadataError> { + let metadata_dir = working_dir.join(".snakemake").join("metadata"); + + if !metadata_dir.exists() { + return Ok(vec![]); + } + + let mut jobs = Vec::new(); + + for entry in std::fs::read_dir(&metadata_dir)? { + let entry = entry?; + let path = Utf8PathBuf::try_from(entry.path()).map_err(|e| { + MetadataError::Io(std::io::Error::new( + std::io::ErrorKind::InvalidData, + e.to_string(), + )) + })?; + + // Skip directories and non-files + if !path.is_file() { + continue; + } + + // Skip hidden files + if path + .file_name() + .map(|n| n.starts_with('.')) + .unwrap_or(false) + { + continue; + } + + match parse_metadata_file(&path) { + Ok(job) => jobs.push(job), + Err(e) => { + // Log but continue on parse errors + eprintln!("Warning: Failed to parse metadata file {}: {}", path, e); + } + } + } + + Ok(jobs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode_metadata_filename() { + // "results/test.txt" in base64 + let encoded = BASE64_STANDARD.encode("results/test.txt"); + let decoded = decode_metadata_filename(&encoded).unwrap(); + assert_eq!(decoded, "results/test.txt"); + } + + #[test] + fn test_parse_metadata() { + let json = r#"{ + "rule": "test_rule", + "input": ["input.txt"], + "log": ["log.txt"], + "params": [], + "shellcmd": "echo hello", + "incomplete": false, + "starttime": 1700000000.0, + "endtime": 1700000100.0, + "job_hash": 12345 + }"#; + + let meta: 
SnakemakeMetadata = serde_json::from_str(json).unwrap(); + assert_eq!(meta.rule, "test_rule"); + assert!(!meta.incomplete); + assert_eq!(meta.endtime, Some(1700000100.0)); + } +} diff --git a/crates/charmer-lsf/Cargo.toml b/crates/charmer-lsf/Cargo.toml new file mode 100644 index 0000000..c022fc5 --- /dev/null +++ b/crates/charmer-lsf/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "charmer-lsf" +version.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +charmer-parsers.workspace = true +chrono.workspace = true +thiserror.workspace = true +tokio.workspace = true diff --git a/crates/charmer-lsf/src/bhist.rs b/crates/charmer-lsf/src/bhist.rs new file mode 100644 index 0000000..43ff80e --- /dev/null +++ b/crates/charmer-lsf/src/bhist.rs @@ -0,0 +1,162 @@ +//! Query LSF job history via bhist. + +use crate::types::{LsfJob, LsfJobState}; +use charmer_parsers::{parse_memory_mb, run_command_allow_failure, MemoryFormat}; +use chrono::{DateTime, Utc}; +use std::time::Duration; +use thiserror::Error; +use tokio::process::Command; + +#[derive(Error, Debug)] +pub enum BhistError { + #[error("Failed to execute bhist: {0}")] + ExecutionError(String), + #[error("Failed to parse bhist output: {0}")] + ParseError(String), +} + +/// Query job history with bhist. +/// Note: bhist output format varies by LSF version, this is a basic implementation. +pub async fn query_bhist( + job_name_filter: Option<&str>, + since: Option>, +) -> Result, BhistError> { + let mut cmd = Command::new("bhist"); + cmd.args(["-a", "-l"]); // All jobs, long format + + // Filter by job name if specified + if let Some(name) = job_name_filter { + cmd.args(["-J", name]); + } + + let stdout = run_command_allow_failure(&mut cmd, "bhist") + .await + .map_err(|e| BhistError::ExecutionError(e.to_string()))?; + + // bhist -l output is complex multi-line format, parse job blocks + parse_bhist_long_output(&stdout, since) +} + +/// Parse bhist -l (long format) output. 
+/// Jobs are separated by dashed lines and contain structured info. +fn parse_bhist_long_output( + output: &str, + since: Option>, +) -> Result, BhistError> { + let mut jobs = Vec::new(); + let mut current_job: Option = None; + + for line in output.lines() { + let line = line.trim(); + + // Job header line: "Job <12345>, ..." + if line.starts_with("Job <") { + // Save previous job if exists + if let Some(job) = current_job.take() { + // Filter by time if specified + if let Some(since_time) = since { + if job.submit_time.map(|t| t >= since_time).unwrap_or(true) { + jobs.push(job); + } + } else { + jobs.push(job); + } + } + + // Parse job ID + if let Some(end) = line.find(">,") { + let job_id = line[5..end].to_string(); + current_job = Some(LsfJob { + job_id, + name: String::new(), + state: LsfJobState::Unknown("UNKNOWN".to_string()), + queue: None, + submit_time: None, + start_time: None, + end_time: None, + exec_host: None, + nprocs: None, + mem_limit_mb: None, + mem_used_mb: None, + run_limit: None, + description: None, + }); + } + } + + // Parse job details from current job + if let Some(ref mut job) = current_job { + if line.contains("Job Name <") { + if let (Some(start), Some(end)) = (line.find("Job Name <"), line.rfind(">")) { + job.name = line[start + 10..end].to_string(); + } + } + if line.contains("Queue <") { + if let (Some(start), Some(end)) = (line.find("Queue <"), line.find(">,")) { + job.queue = Some(line[start + 7..end].to_string()); + } + } + if line.starts_with("Submitted from") || line.contains("submitted from") { + // Parse submit time from context - LSF format varies + } + if line.contains("Started on") { + if let Some(host_start) = line.find("Started on <") { + if let Some(host_end) = line[host_start..].find(">,") { + job.exec_host = + Some(line[host_start + 12..host_start + host_end].to_string()); + } + } + } + if line.contains("Done successfully") { + job.state = LsfJobState::Done { + exit_code: 0, + runtime: Duration::ZERO, + }; + } + if 
line.contains("Exited with exit code") { + let exit_code = line + .split("exit code") + .nth(1) + .and_then(|s| s.trim().trim_end_matches('.').parse().ok()) + .unwrap_or(1); + job.state = LsfJobState::Exit { + exit_code, + error: String::new(), + }; + } + if line.contains("MAX MEM:") { + if let Some(mem_str) = line.split("MAX MEM:").nth(1) { + let mem_part = mem_str.trim().split(';').next().unwrap_or(""); + job.mem_used_mb = parse_memory_mb(mem_part, MemoryFormat::Lsf); + } + } + } + } + + // Don't forget last job + if let Some(job) = current_job { + if let Some(since_time) = since { + if job.submit_time.map(|t| t >= since_time).unwrap_or(true) { + jobs.push(job); + } + } else { + jobs.push(job); + } + } + + Ok(jobs) +} + +#[cfg(test)] +mod tests { + use super::*; + use charmer_parsers::parse_duration; + + #[test] + fn test_parse_runtime() { + assert_eq!(parse_duration("1:30:00"), Some(Duration::from_secs(5400))); + assert_eq!(parse_duration("30:00"), Some(Duration::from_secs(1800))); + assert_eq!(parse_duration("3600"), Some(Duration::from_secs(3600))); + assert!(parse_duration("-").is_none()); + } +} diff --git a/crates/charmer-lsf/src/bjobs.rs b/crates/charmer-lsf/src/bjobs.rs new file mode 100644 index 0000000..ff93c59 --- /dev/null +++ b/crates/charmer-lsf/src/bjobs.rs @@ -0,0 +1,126 @@ +//! Query active LSF jobs via bjobs. 
+ +use crate::types::{LsfJob, LsfJobState}; +use charmer_parsers::{ + non_empty_string, parse_lsf_timestamp, parse_memory_mb, run_command_allow_failure, + split_delimited, MemoryFormat, +}; +use std::time::Duration; +use thiserror::Error; +use tokio::process::Command; + +#[derive(Error, Debug)] +pub enum BjobsError { + #[error("Failed to execute bjobs: {0}")] + ExecutionError(String), + #[error("Failed to parse bjobs output: {0}")] + ParseError(String), +} + +/// bjobs output format (using -o with delimiter) +/// JOBID STAT QUEUE SUBMIT_TIME START_TIME FINISH_TIME EXEC_HOST NPROCS MEMLIMIT JOB_DESCRIPTION +const BJOBS_FORMAT: &str = "jobid stat queue submit_time start_time finish_time exec_host nprocs memlimit job_description delimiter='|'"; + +/// Parse LSF state string. +fn parse_state(s: &str) -> LsfJobState { + match s.to_uppercase().as_str() { + "PEND" => LsfJobState::Pending, + "RUN" => LsfJobState::Running, + "DONE" => LsfJobState::Done { + exit_code: 0, + runtime: Duration::ZERO, + }, + "EXIT" => LsfJobState::Exit { + exit_code: 1, + error: String::new(), + }, + "PSUSP" => LsfJobState::UserSuspendedPending, + "USUSP" => LsfJobState::UserSuspended, + "SSUSP" => LsfJobState::SystemSuspended, + "ZOMBI" => LsfJobState::Zombie, + other => LsfJobState::Unknown(other.to_string()), + } +} + +/// Parse a single line of bjobs output. 
+fn parse_bjobs_line(line: &str) -> Result { + let fields = split_delimited(line, 10).map_err(BjobsError::ParseError)?; + + Ok(LsfJob { + job_id: fields[0].trim().to_string(), + name: String::new(), // bjobs doesn't include name in this format + state: parse_state(fields[1].trim()), + queue: non_empty_string(fields[2]), + submit_time: parse_lsf_timestamp(fields[3].trim()), + start_time: parse_lsf_timestamp(fields[4].trim()), + end_time: parse_lsf_timestamp(fields[5].trim()), + exec_host: non_empty_string(fields[6]), + nprocs: fields[7].trim().parse().ok(), + mem_limit_mb: parse_memory_mb(fields[8].trim(), MemoryFormat::Lsf), + mem_used_mb: None, + run_limit: None, + description: non_empty_string(fields[9]), + }) +} + +/// Query active jobs with bjobs. +pub async fn query_bjobs(job_name_filter: Option<&str>) -> Result, BjobsError> { + let mut cmd = Command::new("bjobs"); + cmd.args(["-o", BJOBS_FORMAT, "-noheader"]); + + // Filter by job name if specified + if let Some(name) = job_name_filter { + cmd.args(["-J", name]); + } + + // bjobs returns non-zero if no jobs found, which is OK + let stdout = run_command_allow_failure(&mut cmd, "bjobs") + .await + .map_err(|e| BjobsError::ExecutionError(e.to_string()))?; + + let mut jobs = Vec::new(); + + for line in stdout.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with("No ") { + continue; + } + match parse_bjobs_line(line) { + Ok(job) => jobs.push(job), + Err(e) => eprintln!("Warning: {}", e), + } + } + + Ok(jobs) +} + +#[cfg(test)] +mod tests { + use super::*; + use charmer_parsers::parse_lsf_timestamp; + + #[test] + fn test_parse_state() { + assert_eq!(parse_state("PEND"), LsfJobState::Pending); + assert_eq!(parse_state("RUN"), LsfJobState::Running); + assert!(matches!(parse_state("DONE"), LsfJobState::Done { .. })); + assert!(matches!(parse_state("EXIT"), LsfJobState::Exit { .. 
})); + } + + #[test] + fn test_parse_memory() { + assert_eq!(parse_memory_mb("4 GB", MemoryFormat::Lsf), Some(4096)); + assert_eq!(parse_memory_mb("1000 MB", MemoryFormat::Lsf), Some(1000)); + assert_eq!(parse_memory_mb("1000", MemoryFormat::Lsf), Some(1000)); + assert!(parse_memory_mb("-", MemoryFormat::Lsf).is_none()); + } + + #[test] + fn test_parse_lsf_time() { + // With year + let dt = parse_lsf_timestamp("Dec 18 10:30 2024").unwrap(); + assert_eq!(dt.format("%Y-%m-%d").to_string(), "2024-12-18"); + + assert!(parse_lsf_timestamp("-").is_none()); + } +} diff --git a/crates/charmer-lsf/src/failure.rs b/crates/charmer-lsf/src/failure.rs new file mode 100644 index 0000000..23e5abf --- /dev/null +++ b/crates/charmer-lsf/src/failure.rs @@ -0,0 +1,383 @@ +//! LSF job failure analysis. +//! +//! Query detailed failure information and provide actionable suggestions. + +use charmer_parsers::run_command_allow_failure; +use thiserror::Error; +use tokio::process::Command; + +#[derive(Error, Debug)] +pub enum FailureError { + #[error("Failed to execute bhist: {0}")] + ExecutionError(String), + #[error("Job not found: {0}")] + NotFound(String), + #[error("Parse error: {0}")] + ParseError(String), +} + +/// Failure mode classification for LSF. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FailureMode { + /// Job ran out of memory + OutOfMemory { + used_mb: u64, + limit_mb: u64, + suggested_mb: u64, + }, + /// Job exceeded time limit + Timeout { + elapsed_seconds: u64, + limit_seconds: u64, + suggested_seconds: u64, + }, + /// Job failed with non-zero exit code + ExitCode { code: i32, signal: Option }, + /// Job was killed by user or admin + Killed { by_user: Option }, + /// Host/node failure + HostFailure { host: Option }, + /// Unknown failure mode + Unknown { term_reason: String }, +} + +/// Detailed failure analysis result for LSF. 
+#[derive(Debug, Clone)] +pub struct FailureAnalysis { + /// LSF job ID + pub job_id: String, + /// Classified failure mode + pub mode: FailureMode, + /// Human-readable explanation + pub explanation: String, + /// Suggested fix + pub suggestion: String, + /// Raw termination reason + pub term_reason: String, + /// Actual memory used (MB) + pub max_mem_mb: Option, + /// Memory limit (MB) + pub mem_limit_mb: Option, + /// Actual runtime (seconds) + pub run_time_seconds: Option, + /// Time limit (seconds) + pub run_limit_seconds: Option, +} + +impl FailureAnalysis { + /// Generate explanation and suggestion based on failure mode. + fn generate_messages(mode: &FailureMode) -> (String, String) { + match mode { + FailureMode::OutOfMemory { + used_mb, + limit_mb, + suggested_mb, + } => { + let explanation = format!( + "Job exceeded memory limit. Used {:.1} GB but limit was {:.1} GB.", + *used_mb as f64 / 1024.0, + *limit_mb as f64 / 1024.0 + ); + let suggestion = format!( + "Increase memory to at least {:.1} GB. In your Snakefile, add:\n resources: mem_mb={}", + *suggested_mb as f64 / 1024.0, + suggested_mb + ); + (explanation, suggestion) + } + FailureMode::Timeout { + elapsed_seconds, + limit_seconds, + suggested_seconds, + } => { + let explanation = format!( + "Job exceeded time limit. Ran for {} but limit was {}.", + format_duration(*elapsed_seconds), + format_duration(*limit_seconds) + ); + let suggestion = format!( + "Increase time limit to at least {}. In your Snakefile, add:\n resources: runtime=\"{}\"", + format_duration(*suggested_seconds), + format_duration_lsf(*suggested_seconds) + ); + (explanation, suggestion) + } + FailureMode::ExitCode { code, signal } => { + let explanation = if let Some(sig) = signal { + format!("Job exited with code {} and signal {}", code, sig) + } else { + match code { + 1 => "Job failed with exit code 1 (general error)".to_string(), + 137 => { + "Job killed (likely OOM). 
Exit code 137 = 128 + 9 (SIGKILL)".to_string() + } + _ => format!("Job failed with exit code {}", code), + } + }; + let suggestion = if *code == 137 { + "This is likely an out-of-memory error. Try increasing memory allocation." + .to_string() + } else { + "Check the job's stderr log for error details.".to_string() + }; + (explanation, suggestion) + } + FailureMode::Killed { by_user } => { + let explanation = if let Some(user) = by_user { + format!("Job was killed by {}", user) + } else { + "Job was killed".to_string() + }; + ( + "Consider if this was intentional or due to dependency failure.".to_string(), + explanation, + ) + } + FailureMode::HostFailure { host } => { + let explanation = if let Some(h) = host { + format!("Job failed due to host {} failure", h) + } else { + "Job failed due to host failure".to_string() + }; + ( + "Re-run the job. If persistent, contact cluster admin.".to_string(), + explanation, + ) + } + FailureMode::Unknown { term_reason } => ( + format!("Job failed: {}", term_reason), + "Check LSF logs for details.".to_string(), + ), + } + } +} + +/// Query detailed failure information for an LSF job. +pub async fn analyze_failure(job_id: &str) -> Result { + // Use bhist -l to get detailed job history including termination info + let mut cmd = Command::new("bhist"); + cmd.args(["-l", job_id]); + + let stdout = run_command_allow_failure(&mut cmd, "bhist") + .await + .map_err(|e| FailureError::ExecutionError(e.to_string()))?; + + if stdout.contains("No matching job found") || stdout.is_empty() { + return Err(FailureError::NotFound(job_id.to_string())); + } + + parse_bhist_output(job_id, &stdout) +} + +/// Parse bhist -l output for failure analysis. 
+fn parse_bhist_output(job_id: &str, output: &str) -> Result { + let mut term_reason = String::new(); + let mut max_mem_mb: Option = None; + let mut mem_limit_mb: Option = None; + let mut run_time_seconds: Option = None; + let mut run_limit_seconds: Option = None; + let mut exit_code: Option = None; + + for line in output.lines() { + let line = line.trim(); + + // Look for termination reason + if line.contains("Exited with exit code") { + if let Some(code) = extract_number(line, "exit code") { + exit_code = Some(code as i32); + } + term_reason = line.to_string(); + } else if line.contains("TERM_") { + term_reason = line.to_string(); + } + + // Look for memory info + if line.contains("MAX MEM:") { + max_mem_mb = parse_lsf_memory(line, "MAX MEM:"); + } + if line.contains("MEMLIMIT") || line.contains("MEM LIMIT:") { + mem_limit_mb = + parse_lsf_memory(line, "MEMLIMIT").or_else(|| parse_lsf_memory(line, "MEM LIMIT:")); + } + + // Look for runtime info + if line.contains("Run time:") || line.contains("RUN_TIME:") { + run_time_seconds = parse_lsf_time(line); + } + if line.contains("RUNLIMIT") || line.contains("RUN LIMIT:") { + run_limit_seconds = parse_lsf_time(line); + } + } + + // Determine failure mode + let mode = if term_reason.contains("TERM_MEMLIMIT") { + let used = max_mem_mb.unwrap_or(0); + let limit = mem_limit_mb.unwrap_or(0); + let suggested = ((used as f64 * 1.5) / 1024.0).ceil() as u64 * 1024; + FailureMode::OutOfMemory { + used_mb: used, + limit_mb: limit, + suggested_mb: suggested.max(limit + 1024), + } + } else if term_reason.contains("TERM_RUNLIMIT") { + let elapsed = run_time_seconds.unwrap_or(0); + let limit = run_limit_seconds.unwrap_or(0); + let suggested = (elapsed as f64 * 1.5) as u64; + FailureMode::Timeout { + elapsed_seconds: elapsed, + limit_seconds: limit, + suggested_seconds: suggested.max(limit + 3600), + } + } else if term_reason.contains("TERM_OWNER") || term_reason.contains("TERM_ADMIN") { + FailureMode::Killed { by_user: None } + } else 
if term_reason.contains("TERM_HOST") || term_reason.contains("TERM_LOAD") { + FailureMode::HostFailure { host: None } + } else if let Some(code) = exit_code { + if code == 137 { + let used = max_mem_mb.unwrap_or(0); + let limit = mem_limit_mb.unwrap_or(0); + let suggested = ((used as f64 * 1.5) / 1024.0).ceil() as u64 * 1024; + FailureMode::OutOfMemory { + used_mb: used, + limit_mb: limit, + suggested_mb: suggested.max(limit + 1024), + } + } else { + FailureMode::ExitCode { code, signal: None } + } + } else if !term_reason.is_empty() { + FailureMode::Unknown { + term_reason: term_reason.clone(), + } + } else { + FailureMode::Unknown { + term_reason: "Unknown failure".to_string(), + } + }; + + let (explanation, suggestion) = FailureAnalysis::generate_messages(&mode); + + Ok(FailureAnalysis { + job_id: job_id.to_string(), + mode, + explanation, + suggestion, + term_reason, + max_mem_mb, + mem_limit_mb, + run_time_seconds, + run_limit_seconds, + }) +} + +/// Extract a number after a given prefix. +fn extract_number(s: &str, prefix: &str) -> Option { + if let Some(idx) = s.find(prefix) { + let after = &s[idx + prefix.len()..]; + let num_str: String = after + .chars() + .skip_while(|c| !c.is_ascii_digit()) + .take_while(|c| c.is_ascii_digit()) + .collect(); + num_str.parse().ok() + } else { + None + } +} + +/// Parse LSF memory value (e.g., "4.5 Gbytes", "1024 Mbytes"). 
+fn parse_lsf_memory(line: &str, prefix: &str) -> Option { + if let Some(idx) = line.find(prefix) { + let after = &line[idx + prefix.len()..]; + // Find the number + let parts: Vec<&str> = after.split_whitespace().collect(); + if parts.len() >= 2 { + let value: f64 = parts[0].parse().ok()?; + let unit = parts[1].to_lowercase(); + return Some(if unit.starts_with('g') { + (value * 1024.0) as u64 + } else if unit.starts_with('m') { + value as u64 + } else if unit.starts_with('k') { + (value / 1024.0) as u64 + } else { + value as u64 + }); + } + } + None +} + +/// Parse LSF time value (e.g., "01:30:00", "1800 seconds"). +fn parse_lsf_time(line: &str) -> Option { + // Look for HH:MM:SS pattern + for word in line.split_whitespace() { + if word.contains(':') { + let parts: Vec<&str> = word.split(':').collect(); + if parts.len() == 3 { + let hours: u64 = parts[0].parse().ok()?; + let mins: u64 = parts[1].parse().ok()?; + let secs: u64 = parts[2].parse().ok()?; + return Some(hours * 3600 + mins * 60 + secs); + } + } + } + // Look for "N seconds" pattern + if let Some(idx) = line.find("seconds") { + let before = line[..idx].trim(); + // Get the last number token before "seconds" + if let Some(num_str) = before.split_whitespace().last() { + if let Ok(secs) = num_str.parse::() { + return Some(secs as u64); + } + } + } + None +} + +/// Format seconds as human-readable duration. +fn format_duration(seconds: u64) -> String { + let hours = seconds / 3600; + let mins = (seconds % 3600) / 60; + let secs = seconds % 60; + + if hours > 24 { + let days = hours / 24; + let hours = hours % 24; + format!("{}d {:02}:{:02}:{:02}", days, hours, mins, secs) + } else if hours > 0 { + format!("{:02}:{:02}:{:02}", hours, mins, secs) + } else { + format!("{:02}:{:02}", mins, secs) + } +} + +/// Format seconds as LSF duration format (HH:MM). 
+fn format_duration_lsf(seconds: u64) -> String { + let hours = seconds / 3600; + let mins = (seconds % 3600) / 60; + format!("{}:{:02}", hours, mins) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_lsf_memory() { + assert_eq!( + parse_lsf_memory("MAX MEM: 4.5 Gbytes", "MAX MEM:"), + Some(4608) + ); + assert_eq!( + parse_lsf_memory("MEMLIMIT 8192 Mbytes", "MEMLIMIT"), + Some(8192) + ); + } + + #[test] + fn test_parse_lsf_time() { + assert_eq!(parse_lsf_time("Run time: 01:30:00"), Some(5400)); + assert_eq!(parse_lsf_time("1800 seconds"), Some(1800)); + } +} diff --git a/crates/charmer-lsf/src/lib.rs b/crates/charmer-lsf/src/lib.rs new file mode 100644 index 0000000..595db61 --- /dev/null +++ b/crates/charmer-lsf/src/lib.rs @@ -0,0 +1,13 @@ +//! LSF integration for charmer. +//! +//! Query job status via bjobs and bhist. + +pub mod bhist; +pub mod bjobs; +pub mod failure; +pub mod types; + +pub use bhist::{query_bhist, BhistError}; +pub use bjobs::{query_bjobs, BjobsError}; +pub use failure::{analyze_failure, FailureAnalysis, FailureError, FailureMode}; +pub use types::{LsfJob, LsfJobState}; diff --git a/crates/charmer-lsf/src/types.rs b/crates/charmer-lsf/src/types.rs new file mode 100644 index 0000000..33565bf --- /dev/null +++ b/crates/charmer-lsf/src/types.rs @@ -0,0 +1,70 @@ +//! LSF job types. + +use chrono::{DateTime, Utc}; +use std::time::Duration; + +/// LSF job status. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LsfJobState { + /// PEND - Job is pending + Pending, + /// RUN - Job is running + Running, + /// DONE - Job completed successfully + Done { exit_code: i32, runtime: Duration }, + /// EXIT - Job exited with non-zero status + Exit { exit_code: i32, error: String }, + /// PSUSP - Job suspended by user while pending + UserSuspendedPending, + /// USUSP - Job suspended by user while running + UserSuspended, + /// SSUSP - Job suspended by system + SystemSuspended, + /// ZOMBI - Job is zombie (finished but info not available) + Zombie, + /// Unknown state + Unknown(String), +} + +/// LSF job information from bjobs/bhist. +#[derive(Debug, Clone)] +pub struct LsfJob { + /// LSF job ID + pub job_id: String, + + /// Job name + pub name: String, + + /// Job state + pub state: LsfJobState, + + /// Queue name + pub queue: Option, + + /// Submit time + pub submit_time: Option>, + + /// Start time + pub start_time: Option>, + + /// End time + pub end_time: Option>, + + /// Execution host(s) + pub exec_host: Option, + + /// Number of processors + pub nprocs: Option, + + /// Memory limit (MB) + pub mem_limit_mb: Option, + + /// Actual memory used (MB) + pub mem_used_mb: Option, + + /// Run limit (wall clock time) + pub run_limit: Option, + + /// Job description (used by snakemake for rule info) + pub description: Option, +} diff --git a/crates/charmer-monitor/Cargo.toml b/crates/charmer-monitor/Cargo.toml index 70f0bfd..0a2555b 100644 --- a/crates/charmer-monitor/Cargo.toml +++ b/crates/charmer-monitor/Cargo.toml @@ -10,3 +10,5 @@ ratatui.workspace = true crossterm.workspace = true tokio.workspace = true thiserror.workspace = true +chrono.workspace = true +arboard.workspace = true diff --git a/crates/charmer-monitor/src/app.rs b/crates/charmer-monitor/src/app.rs index 61aaeac..780da40 100644 --- a/crates/charmer-monitor/src/app.rs +++ b/crates/charmer-monitor/src/app.rs @@ -1,31 +1,983 @@ //! Main TUI application. 
-use charmer_state::PipelineState; +use crate::components::{ + Footer, Header, JobDetail, JobList, LogViewer, LogViewerState, RuleSummary, +}; +use crate::ui::Theme; +use charmer_state::{JobStatus, PipelineState, MAIN_PIPELINE_JOB_ID}; +use crossterm::event::{self, Event, KeyCode, KeyEvent, KeyModifiers}; +use ratatui::{ + layout::{Constraint, Direction, Layout, Rect}, + widgets::Clear, + Frame, +}; +use std::time::{Duration, Instant}; +/// Filter mode for job list. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum FilterMode { + #[default] + All, + Running, + Failed, + Pending, + Completed, +} + +impl FilterMode { + pub fn next(self) -> Self { + match self { + Self::All => Self::Running, + Self::Running => Self::Failed, + Self::Failed => Self::Pending, + Self::Pending => Self::Completed, + Self::Completed => Self::All, + } + } + + pub fn label(&self) -> &'static str { + match self { + Self::All => "All", + Self::Running => "Running", + Self::Failed => "Failed", + Self::Pending => "Pending", + Self::Completed => "Completed", + } + } + + pub fn matches(&self, status: JobStatus) -> bool { + match self { + Self::All => true, + Self::Running => matches!(status, JobStatus::Running), + Self::Failed => matches!(status, JobStatus::Failed), + Self::Pending => matches!(status, JobStatus::Pending | JobStatus::Queued), + Self::Completed => matches!(status, JobStatus::Completed), + } + } +} + +/// Sort mode for job list. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum SortMode { + #[default] + Status, + Rule, + Time, +} + +/// View mode for main panel. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum ViewMode { + /// Show individual jobs + #[default] + Jobs, + /// Show rule summary + Rules, +} + +impl SortMode { + pub fn next(self) -> Self { + match self { + Self::Status => Self::Rule, + Self::Rule => Self::Time, + Self::Time => Self::Status, + } + } + + pub fn label(&self) -> &'static str { + match self { + Self::Status => "Status", + Self::Rule => "Rule", + Self::Time => "Time", + } + } +} + +/// Main application state. pub struct App { pub state: PipelineState, pub should_quit: bool, - pub selected_job: Option, + pub selected_index: usize, + pub filter_mode: FilterMode, + pub sort_mode: SortMode, + pub view_mode: ViewMode, + pub show_help: bool, + pub show_log_viewer: bool, + pub log_viewer_state: Option, + pub theme: Theme, + pub last_tick: Instant, + job_ids: Vec, // Cached sorted/filtered job IDs + rule_names: Vec, // Cached rule names for rule view + status_message: Option<(String, Instant)>, // Temporary status message with timestamp + command_expanded: bool, // Whether command section is expanded in details } impl App { pub fn new(state: PipelineState) -> Self { - Self { + let job_ids = state.jobs.keys().cloned().collect(); + let rule_names: Vec = state.jobs_by_rule.keys().cloned().collect(); + let mut app = Self { state, should_quit: false, - selected_job: None, + selected_index: 0, + filter_mode: FilterMode::default(), + sort_mode: SortMode::default(), + view_mode: ViewMode::default(), + show_help: false, + show_log_viewer: false, + log_viewer_state: None, + theme: Theme::dark(), + last_tick: Instant::now(), + job_ids, + rule_names, + status_message: None, + command_expanded: false, + }; + // Update job list first to ensure MAIN_PIPELINE_JOB_ID is in the list + app.update_job_list(); + // Open log viewer by default + app.open_log_viewer(); + app + } + + /// Update cached job list based on filter and sort. 
+ pub fn update_job_list(&mut self) { + let mut jobs: Vec<_> = self + .state + .jobs + .iter() + .filter(|(_, job)| self.filter_mode.matches(job.status)) + .collect(); + + // Sort jobs + match self.sort_mode { + SortMode::Status => { + jobs.sort_by_key(|(_, job)| { + // Target jobs (like "all") always go to the bottom, regardless of status + // They represent pipeline completion and should be last + if job.is_target { + return 10; + } + match job.status { + JobStatus::Running => 0, + JobStatus::Failed => 1, + JobStatus::Queued => 2, + JobStatus::Pending => 3, + JobStatus::Completed => 4, + JobStatus::Cancelled => 5, + JobStatus::Unknown => 6, + } + }); + } + SortMode::Rule => { + jobs.sort_by(|(_, a), (_, b)| { + // Target jobs go to the bottom in rule sort too + match (a.is_target, b.is_target) { + (true, false) => std::cmp::Ordering::Greater, + (false, true) => std::cmp::Ordering::Less, + _ => a.rule.cmp(&b.rule), + } + }); + } + SortMode::Time => { + jobs.sort_by(|(_, a), (_, b)| { + // Target jobs go to the bottom in time sort too + match (a.is_target, b.is_target) { + (true, false) => std::cmp::Ordering::Greater, + (false, true) => std::cmp::Ordering::Less, + _ => { + let a_time = a.timing.started_at.or(a.timing.queued_at); + let b_time = b.timing.started_at.or(b.timing.queued_at); + b_time.cmp(&a_time) // Most recent first + } + } + }); + } } + + // Build job IDs list with main pipeline job at top + self.job_ids = Vec::with_capacity(jobs.len() + 1); + + // Always add main pipeline job at the top (when viewing all or running) + if matches!(self.filter_mode, FilterMode::All | FilterMode::Running) { + self.job_ids.push(MAIN_PIPELINE_JOB_ID.to_string()); + } + + // Add sorted job IDs + self.job_ids + .extend(jobs.into_iter().map(|(id, _)| id.clone())); + + // Clamp selection + if !self.job_ids.is_empty() { + self.selected_index = self.selected_index.min(self.job_ids.len() - 1); + } else { + self.selected_index = 0; + } + } + + /// Get the currently selected job. 
+ /// Returns None if the main pipeline job is selected (it's synthetic). + pub fn selected_job(&self) -> Option<&charmer_state::Job> { + self.job_ids.get(self.selected_index).and_then(|id| { + if id == MAIN_PIPELINE_JOB_ID { + None // Main pipeline job is synthetic + } else { + self.state.jobs.get(id) + } + }) + } + + /// Check if the main pipeline job is currently selected. + pub fn is_main_pipeline_selected(&self) -> bool { + self.job_ids + .get(self.selected_index) + .map(|id| id == MAIN_PIPELINE_JOB_ID) + .unwrap_or(false) + } + + /// Get the currently selected job ID. + pub fn selected_job_id(&self) -> Option<&str> { + self.job_ids.get(self.selected_index).map(|s| s.as_str()) + } + + /// Get filtered job IDs. + pub fn filtered_jobs(&self) -> &[String] { + &self.job_ids } pub fn quit(&mut self) { self.should_quit = true; } + /// Get the list length based on current view mode. + fn list_len(&self) -> usize { + match self.view_mode { + ViewMode::Jobs => self.job_ids.len(), + ViewMode::Rules => self.rule_names.len(), + } + } + pub fn select_next(&mut self) { - // TODO: Implement job selection + let len = self.list_len(); + if len > 0 { + self.selected_index = (self.selected_index + 1) % len; + self.command_expanded = false; // Reset expansion when navigating + } } pub fn select_previous(&mut self) { - // TODO: Implement job selection + let len = self.list_len(); + if len > 0 { + self.selected_index = self.selected_index.checked_sub(1).unwrap_or(len - 1); + self.command_expanded = false; // Reset expansion when navigating + } + } + + pub fn select_first(&mut self) { + self.selected_index = 0; + } + + pub fn select_last(&mut self) { + let len = self.list_len(); + if len > 0 { + self.selected_index = len - 1; + } + } + + pub fn cycle_filter(&mut self) { + self.filter_mode = self.filter_mode.next(); + self.update_job_list(); + } + + pub fn cycle_sort(&mut self) { + self.sort_mode = self.sort_mode.next(); + self.update_job_list(); + } + + pub fn toggle_help(&mut self) 
{ + self.show_help = !self.show_help; + } + + /// Toggle between jobs and rules view. + pub fn toggle_view_mode(&mut self) { + self.view_mode = match self.view_mode { + ViewMode::Jobs => ViewMode::Rules, + ViewMode::Rules => ViewMode::Jobs, + }; + // Reset selection when switching views + self.selected_index = 0; + // Update rule names list when switching to rules view + if self.view_mode == ViewMode::Rules { + self.update_rule_list(); + } + } + + /// Update the cached rule names list. + fn update_rule_list(&mut self) { + let mut rules: Vec<_> = self.state.jobs_by_rule.keys().cloned().collect(); + rules.sort(); + self.rule_names = rules; + } + + /// Get the currently selected rule name (in rules view). + pub fn selected_rule(&self) -> Option<&str> { + if self.view_mode == ViewMode::Rules { + self.rule_names.get(self.selected_index).map(|s| s.as_str()) + } else { + None + } + } + + /// Toggle log viewer for the currently selected job. + pub fn toggle_log_viewer(&mut self) { + if self.show_log_viewer { + self.close_log_viewer(); + return; + } + self.open_log_viewer(); + } + + /// Find the best log file path for a job. 
+ fn find_log_path(&self, job: &charmer_state::Job) -> String { + let working_dir = &self.state.working_dir; + + // Try log files from snakemake metadata first + for log_file in &job.log_files { + let full_path = working_dir.join(log_file); + if full_path.exists() { + return full_path.to_string(); + } + // Also try as-is (might be absolute) + if std::path::Path::new(log_file).exists() { + return log_file.clone(); + } + } + + // Try SLURM log path format: .snakemake/slurm_logs/rule_{rule}/{slurm_job_id}.log + if let Some(ref slurm_id) = job.slurm_job_id { + let slurm_log = working_dir + .join(".snakemake") + .join("slurm_logs") + .join(format!("rule_{}", job.rule)) + .join(format!("{}.log", slurm_id)); + if slurm_log.exists() { + return slurm_log.to_string(); + } + } + + // Try common log directory patterns + let wildcards_suffix = job + .wildcards + .as_ref() + .map(|w| { + // Extract just the values, e.g., "sample=sample1, chrom=chr1" -> "sample1" + w.split(',') + .next() + .and_then(|s| s.split('=').nth(1)) + .unwrap_or("") + }) + .unwrap_or(""); + + // Try logs/{rule}/{wildcard}.log + if !wildcards_suffix.is_empty() { + let pattern_log = working_dir + .join("logs") + .join(&job.rule) + .join(format!("{}.log", wildcards_suffix)); + if pattern_log.exists() { + return pattern_log.to_string(); + } + } + + // Try logs/{rule}.log + let rule_log = working_dir.join("logs").join(format!("{}.log", job.rule)); + if rule_log.exists() { + return rule_log.to_string(); + } + + // Try main snakemake log as fallback (most recent .snakemake.log file) + if let Some(main_log) = self.find_latest_snakemake_log() { + return main_log; + } + + // Fallback: return a path that shows what we're looking for + if !job.log_files.is_empty() { + working_dir.join(&job.log_files[0]).to_string() + } else { + working_dir + .join("logs") + .join(&job.rule) + .join(format!("{}.log", wildcards_suffix)) + .to_string() + } + } + + /// Find the most recent main snakemake log file. 
+ fn find_latest_snakemake_log(&self) -> Option { + let log_dir = self.state.working_dir.join(".snakemake").join("log"); + if !log_dir.exists() { + return None; + } + + let mut latest: Option<(std::time::SystemTime, String)> = None; + + if let Ok(entries) = std::fs::read_dir(&log_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.ends_with(".snakemake.log") { + if let Ok(metadata) = entry.metadata() { + if let Ok(modified) = metadata.modified() { + let path_str = path.to_string_lossy().to_string(); + if latest.is_none() || modified > latest.as_ref().unwrap().0 { + latest = Some((modified, path_str)); + } + } + } + } + } + } + } + + latest.map(|(_, path)| path) + } + + /// Open log viewer for the currently selected job. + fn open_log_viewer(&mut self) { + let log_path = if self.is_main_pipeline_selected() { + // For main pipeline job, show the main snakemake log + self.find_latest_snakemake_log() + .unwrap_or_else(|| "(no snakemake log found)".to_string()) + } else if let Some(job) = self.selected_job().cloned() { + self.find_log_path(&job) + } else { + return; + }; + + let mut state = LogViewerState::new(log_path, 1000); + state.follow_mode = true; // Enable follow mode by default for panel view + self.log_viewer_state = Some(state); + self.show_log_viewer = true; + } + + /// Update log viewer to show the currently selected job's logs. + fn update_log_viewer_for_selected(&mut self) { + let log_path = if self.is_main_pipeline_selected() { + // For main pipeline job, show the main snakemake log + self.find_latest_snakemake_log() + .unwrap_or_else(|| "(no snakemake log found)".to_string()) + } else if let Some(job) = self.selected_job().cloned() { + self.find_log_path(&job) + } else { + return; + }; + + let mut state = LogViewerState::new(log_path, 1000); + state.follow_mode = true; + self.log_viewer_state = Some(state); + } + + /// Close the log viewer. 
+ pub fn close_log_viewer(&mut self) { + self.show_log_viewer = false; + self.log_viewer_state = None; + } + + /// Refresh the log viewer content. + pub fn refresh_log_viewer(&mut self) { + if let Some(ref state) = self.log_viewer_state { + let log_path = state.log_path.clone(); + let follow = state.follow_mode; + self.log_viewer_state = Some(LogViewerState::new(log_path, 1000)); + if follow { + if let Some(ref mut new_state) = self.log_viewer_state { + new_state.follow_mode = true; + new_state.scroll_to_bottom(); + } + } + } + } + + /// Update app state from external source (polling service). + pub fn update_from_state(&mut self, new_state: PipelineState) { + self.state = new_state; + self.update_job_list(); + + // Refresh log viewer if in follow mode + if self.show_log_viewer { + if let Some(ref state) = self.log_viewer_state { + if state.follow_mode { + self.refresh_log_viewer(); + } + } + } + } + + /// Copy the selected job's shell command to clipboard. + fn copy_command(&mut self) { + let job = self.selected_job(); + if let Some(job) = job { + let cmd = job.shellcmd.trim(); + if cmd.is_empty() { + self.status_message = Some(("No command to copy".to_string(), Instant::now())); + return; + } + + match arboard::Clipboard::new() { + Ok(mut clipboard) => match clipboard.set_text(cmd) { + Ok(()) => { + self.status_message = + Some(("Command copied to clipboard".to_string(), Instant::now())); + } + Err(_) => { + self.status_message = + Some(("Failed to copy to clipboard".to_string(), Instant::now())); + } + }, + Err(_) => { + self.status_message = + Some(("Clipboard not available".to_string(), Instant::now())); + } + } + } else { + self.status_message = Some(("No job selected".to_string(), Instant::now())); + } + } + + /// Handle a key event. 
+ pub fn handle_key(&mut self, key: KeyEvent) { + // If help is showing, any key closes it + if self.show_help { + self.show_help = false; + return; + } + + match key.code { + KeyCode::Char('q') => self.quit(), + KeyCode::Char('c') if key.modifiers.contains(KeyModifiers::CONTROL) => self.quit(), + KeyCode::Char('j') | KeyCode::Down => { + self.select_next(); + // Update log viewer to show new job's logs + if self.show_log_viewer { + self.update_log_viewer_for_selected(); + } + } + KeyCode::Char('k') | KeyCode::Up => { + self.select_previous(); + // Update log viewer to show new job's logs + if self.show_log_viewer { + self.update_log_viewer_for_selected(); + } + } + KeyCode::Char('g') | KeyCode::Home => self.select_first(), + KeyCode::Char('G') | KeyCode::End => self.select_last(), + KeyCode::Char('f') => self.cycle_filter(), + KeyCode::Char('s') => self.cycle_sort(), + KeyCode::Char('r') => self.toggle_view_mode(), + KeyCode::Char('l') | KeyCode::Enter => self.toggle_log_viewer(), + KeyCode::Char('F') if self.show_log_viewer => { + // Toggle follow mode when log panel is open + if let Some(ref mut state) = self.log_viewer_state { + state.toggle_follow(); + } + } + KeyCode::Char('?') => self.toggle_help(), + KeyCode::Char('c') => self.copy_command(), + KeyCode::Char('e') => self.command_expanded = !self.command_expanded, + _ => {} + } + } + + /// Poll for events and handle them. + pub fn poll_events(&mut self, timeout: Duration) -> std::io::Result { + if event::poll(timeout)? { + if let Event::Key(key) = event::read()? { + self.handle_key(key); + return Ok(true); + } + } + Ok(false) + } + + /// Render the UI. 
+ pub fn render(&self, frame: &mut Frame) { + // Adjust layout based on whether log panel is open + let chunks = if self.show_log_viewer { + Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Length(3), // Header (1 line + borders) + Constraint::Min(8), // Main content (smaller when logs open) + Constraint::Length(12), // Log panel + Constraint::Length(1), // Footer + ]) + .split(frame.area()) + } else { + Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Length(3), // Header (1 line + borders) + Constraint::Min(10), // Main content + Constraint::Length(0), // No log panel + Constraint::Length(1), // Footer + ]) + .split(frame.area()) + }; + + // Header + Header::render(frame, chunks[0], &self.state); + + // Main content: split horizontally + let main_chunks = Layout::default() + .direction(Direction::Horizontal) + .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) + .split(chunks[1]); + + // Render based on view mode - tabs are now in the block titles + match self.view_mode { + ViewMode::Jobs => { + // Job list (left) and detail (right) + JobList::render( + frame, + main_chunks[0], + &self.state, + &self.job_ids, + Some(self.selected_index), + self.filter_mode.label(), + self.sort_mode.label(), + ); + + // Render job detail or pipeline summary + if self.is_main_pipeline_selected() { + JobDetail::render_pipeline(frame, main_chunks[1], &self.state); + } else { + JobDetail::render( + frame, + main_chunks[1], + self.selected_job(), + self.command_expanded, + ); + } + } + ViewMode::Rules => { + // Rule summary table (left panel) + RuleSummary::render( + frame, + main_chunks[0], + &self.state, + &self.rule_names, + Some(self.selected_index), + ); + + // Show stats for selected rule in right panel + if let Some(rule) = self.selected_rule() { + self.render_rule_detail(frame, main_chunks[1], rule); + } + } + } + + // Log panel at bottom (if open) + if self.show_log_viewer { + 
self.render_log_panel(frame, chunks[2]); + } + + // Get recent status message (within 3 seconds) + let status_msg = self.status_message.as_ref().and_then(|(msg, timestamp)| { + if timestamp.elapsed() < Duration::from_secs(3) { + Some(msg.as_str()) + } else { + None + } + }); + + // Footer with optional status message + Footer::render(frame, chunks[3], status_msg); + + // Help overlay (on top of everything) + if self.show_help { + self.render_help_overlay(frame); + } + } + + /// Render detail panel for selected rule. + fn render_rule_detail(&self, frame: &mut Frame, area: Rect, rule: &str) { + use ratatui::style::{Color, Modifier, Style}; + use ratatui::text::{Line, Span}; + use ratatui::widgets::{Block, Borders, Paragraph}; + + let mut lines = Vec::new(); + + // Rule name + lines.push(Line::from(vec![ + Span::styled("Rule: ", Style::default().fg(Color::Gray)), + Span::styled( + rule.to_string(), + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + ), + ])); + + lines.push(Line::from("")); + + // Get jobs for this rule + let mut running = 0; + let mut completed = 0; + let mut failed = 0; + let mut pending = 0; + + if let Some(job_ids) = self.state.jobs_by_rule.get(rule) { + let mut total_runtime: u64 = 0; + let mut completed_count = 0; + + for id in job_ids { + if let Some(job) = self.state.jobs.get(id) { + match job.status { + JobStatus::Running => running += 1, + JobStatus::Completed => { + completed += 1; + if let (Some(start), Some(end)) = + (job.timing.started_at, job.timing.completed_at) + { + total_runtime += (end - start).num_seconds().max(0) as u64; + completed_count += 1; + } + } + JobStatus::Failed => failed += 1, + JobStatus::Pending | JobStatus::Queued => pending += 1, + _ => {} + } + } + } + + // Stats section + lines.push(Line::from(Span::styled( + "Statistics", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + lines.push(Line::from(vec![ + Span::styled(" Total: ", 
Style::default().fg(Color::Gray)), + Span::styled(job_ids.len().to_string(), Style::default().fg(Color::White)), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Running: ", Style::default().fg(Color::Gray)), + Span::styled( + running.to_string(), + Style::default().fg(if running > 0 { + Color::Yellow + } else { + Color::Gray + }), + ), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Completed: ", Style::default().fg(Color::Gray)), + Span::styled(completed.to_string(), Style::default().fg(Color::Green)), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Failed: ", Style::default().fg(Color::Gray)), + Span::styled( + failed.to_string(), + Style::default().fg(if failed > 0 { Color::Red } else { Color::Gray }), + ), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Pending: ", Style::default().fg(Color::Gray)), + Span::styled(pending.to_string(), Style::default().fg(Color::Blue)), + ])); + + // Timing section + if completed_count > 0 { + lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Timing", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + let avg_secs = total_runtime / completed_count as u64; + lines.push(Line::from(vec![ + Span::styled(" Avg runtime: ", Style::default().fg(Color::Gray)), + Span::styled(format_secs(avg_secs), Style::default().fg(Color::Yellow)), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Total runtime: ", Style::default().fg(Color::Gray)), + Span::styled( + format_secs(total_runtime), + Style::default().fg(Color::Green), + ), + ])); + } + + // Progress + let progress = if !job_ids.is_empty() { + completed * 100 / job_ids.len() + } else { + 0 + }; + + lines.push(Line::from("")); + lines.push(Line::from(vec![ + Span::styled(" Progress: ", Style::default().fg(Color::Gray)), + Span::styled( + format!("{}%", progress), + Style::default() + .fg(Color::Green) + .add_modifier(Modifier::BOLD), + ), + ])); + } + + // Add job status as colored 
text (compact display) + let total = running + completed + failed + pending; + if total > 0 { + lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Job Status", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + // Compact colored status: ▶3 ✓12 ✗1 ○5 + let mut status_spans = vec![Span::raw(" ")]; + + if running > 0 { + status_spans.push(Span::styled( + format!("▶{}", running), + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD), + )); + status_spans.push(Span::raw(" ")); + } + + status_spans.push(Span::styled( + format!("✓{}", completed), + Style::default() + .fg(Color::Green) + .add_modifier(Modifier::BOLD), + )); + status_spans.push(Span::raw(" ")); + + if failed > 0 { + status_spans.push(Span::styled( + format!("✗{}", failed), + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD), + )); + status_spans.push(Span::raw(" ")); + } + + if pending > 0 { + status_spans.push(Span::styled( + format!("○{}", pending), + Style::default() + .fg(Color::Blue) + .add_modifier(Modifier::BOLD), + )); + } + + lines.push(Line::from(status_spans)); + } + + let paragraph = Paragraph::new(lines).block( + Block::default() + .borders(Borders::ALL) + .title(" Rule Details "), + ); + frame.render_widget(paragraph, area); + } + + fn render_log_panel(&self, frame: &mut Frame, area: Rect) { + if let Some(ref state) = self.log_viewer_state { + // Render log viewer as a bottom panel (tailed output) + LogViewer::render_panel(frame, area, state); + } + } + + fn render_help_overlay(&self, frame: &mut Frame) { + use ratatui::style::{Color, Style}; + use ratatui::widgets::{Block, Borders, Paragraph}; + + let area = centered_rect(60, 60, frame.area()); + + let help_text = r#" + Keyboard Shortcuts + ────────────────── + + j / ↓ Move down (also updates log panel) + k / ↑ Move up (also updates log panel) + g / Home Go to first item + G / End Go to last item + r Toggle view (Jobs/Rules summary) + d 
Toggle DAG view + f Cycle filter (All/Running/Failed/Pending/Completed) + s Cycle sort (Status/Rule/Time) + l / Enter Toggle log panel + F Toggle follow mode (when logs open) + c Copy command to clipboard + e Expand/collapse command + ? Toggle this help + q / Ctrl+C Quit + + Press any key to close +"#; + + frame.render_widget(Clear, area); + let paragraph = Paragraph::new(help_text) + .block( + Block::default() + .borders(Borders::ALL) + .title(" Help ") + .style(Style::default().bg(Color::DarkGray)), + ) + .style(Style::default().fg(Color::White).bg(Color::DarkGray)); + + frame.render_widget(paragraph, area); + } +} + +/// Create a centered rectangle. +fn centered_rect(percent_x: u16, percent_y: u16, area: Rect) -> Rect { + let popup_layout = Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Percentage((100 - percent_y) / 2), + Constraint::Percentage(percent_y), + Constraint::Percentage((100 - percent_y) / 2), + ]) + .split(area); + + Layout::default() + .direction(Direction::Horizontal) + .constraints([ + Constraint::Percentage((100 - percent_x) / 2), + Constraint::Percentage(percent_x), + Constraint::Percentage((100 - percent_x) / 2), + ]) + .split(popup_layout[1])[1] +} + +/// Format seconds as human-readable duration. +fn format_secs(secs: u64) -> String { + if secs >= 3600 { + let hours = secs / 3600; + let mins = (secs % 3600) / 60; + format!("{}h{}m", hours, mins) + } else if secs >= 60 { + let mins = secs / 60; + let secs = secs % 60; + format!("{}m{}s", mins, secs) + } else { + format!("{}s", secs) } } diff --git a/crates/charmer-monitor/src/components/footer.rs b/crates/charmer-monitor/src/components/footer.rs index dc612a9..b23e4ac 100644 --- a/crates/charmer-monitor/src/components/footer.rs +++ b/crates/charmer-monitor/src/components/footer.rs @@ -1,19 +1,51 @@ -//! Footer component with keyboard shortcuts. +//! Footer component with keyboard shortcuts and status messages. 
use ratatui::{ - layout::Rect, - style::{Color, Style}, + layout::{Constraint, Direction, Layout, Rect}, + style::{Color, Modifier, Style}, + text::{Line, Span}, widgets::Paragraph, Frame, }; +/// Version from Cargo.toml +const VERSION: &str = env!("CARGO_PKG_VERSION"); + pub struct Footer; impl Footer { - pub fn render(frame: &mut Frame, area: Rect) { - let help = "j/k:navigate l:logs f:filter s:sort ?:help q:quit"; - let paragraph = Paragraph::new(help) - .style(Style::default().fg(Color::DarkGray)); - frame.render_widget(paragraph, area); + pub fn render(frame: &mut Frame, area: Rect, status_message: Option<&str>) { + let help = "j/k:navigate l:logs r:rules f:filter s:sort ?:help q:quit"; + let version = format!("v{}", VERSION); + + // Split footer into left (help/status), right (version) + let chunks = Layout::default() + .direction(Direction::Horizontal) + .constraints([ + Constraint::Min(1), + Constraint::Length(version.len() as u16 + 1), + ]) + .split(area); + + // Show status message if present, otherwise show help + let left_content = if let Some(msg) = status_message { + Line::from(Span::styled( + msg.to_string(), + Style::default() + .fg(Color::Magenta) + .add_modifier(Modifier::BOLD), + )) + } else { + Line::from(Span::styled(help, Style::default().fg(Color::Gray))) + }; + + let help_paragraph = Paragraph::new(left_content); + frame.render_widget(help_paragraph, chunks[0]); + + let version_paragraph = Paragraph::new(Line::from(Span::styled( + version, + Style::default().fg(Color::Gray), + ))); + frame.render_widget(version_paragraph, chunks[1]); } } diff --git a/crates/charmer-monitor/src/components/header.rs b/crates/charmer-monitor/src/components/header.rs index 66a3b96..73f6be3 100644 --- a/crates/charmer-monitor/src/components/header.rs +++ b/crates/charmer-monitor/src/components/header.rs @@ -1,28 +1,114 @@ -//! Header component with progress bar. +//! Header component with dense single-line info display. 
+use charmer_state::PipelineState; +use chrono::Local; use ratatui::{ layout::Rect, - widgets::{Block, Borders, Gauge}, + style::{Color, Modifier, Style}, + text::{Line, Span}, + widgets::{Block, Borders, Paragraph}, Frame, }; -use charmer_state::PipelineState; pub struct Header; impl Header { pub fn render(frame: &mut Frame, area: Rect, state: &PipelineState) { - let counts = state.job_counts(); - let progress = if counts.total > 0 { - (counts.completed as f64 / counts.total as f64) * 100.0 + // Current date/time + let now = Local::now(); + let datetime = now.format("%Y-%m-%d %H:%M:%S").to_string(); + + // Truncate working dir to fit in header + let working_dir = state.working_dir.as_str(); + let max_dir_len = (area.width as usize).saturating_sub(80); // Leave room for other elements + let dir_display = if working_dir.len() > max_dir_len && max_dir_len > 3 { + format!("…{}", &working_dir[working_dir.len() - max_dir_len + 1..]) } else { - 0.0 + working_dir.to_string() }; - let gauge = Gauge::default() - .block(Block::default().borders(Borders::ALL).title("charmer")) - .percent(progress as u16) - .label(format!("{}/{} jobs", counts.completed, counts.total)); + let sep = Span::styled(" │ ", Style::default().fg(Color::DarkGray)); + + let mut spans = Vec::new(); + + // App name with status + spans.push(Span::styled( + "🐍 charmer", + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + )); + + // Status indicator + if state.pipeline_finished { + spans.push(Span::raw(" ")); + spans.push(Span::styled( + "✓", + Style::default() + .fg(Color::Green) + .add_modifier(Modifier::BOLD), + )); + } else if !state.pipeline_errors.is_empty() { + spans.push(Span::raw(" ")); + spans.push(Span::styled( + "✗", + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD), + )); + } + + spans.push(sep.clone()); + + // Working directory + spans.push(Span::styled(dir_display, Style::default().fg(Color::White))); + + // ETA (only if running and available) + if let 
Some(eta) = state.eta_string() { + if !state.pipeline_finished && state.pipeline_errors.is_empty() { + spans.push(sep.clone()); + spans.push(Span::styled("ETA: ", Style::default().fg(Color::Gray))); + spans.push(Span::styled( + eta, + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD), + )); + } + } + + spans.push(sep.clone()); + spans.push(Span::styled(datetime, Style::default().fg(Color::Green))); + + // Status counts (abbreviated) + let counts = state.job_counts(); + spans.push(sep.clone()); + spans.push(Span::styled( + format!("{} Pend", counts.pending + counts.queued), + Style::default().fg(Color::White), + )); + spans.push(sep.clone()); + spans.push(Span::styled( + format!("{} Run", counts.running), + Style::default().fg(Color::Yellow), + )); + spans.push(sep.clone()); + spans.push(Span::styled( + format!("{} Done", counts.completed), + Style::default().fg(Color::Green), + )); + spans.push(sep); + spans.push(Span::styled( + format!("{} Fail", counts.failed), + Style::default().fg(if counts.failed > 0 { + Color::Red + } else { + Color::DarkGray + }), + )); + + let content = Line::from(spans); + let block = Block::default().borders(Borders::ALL); + let paragraph = Paragraph::new(content).block(block); - frame.render_widget(gauge, area); + frame.render_widget(paragraph, area); } } diff --git a/crates/charmer-monitor/src/components/job_detail.rs b/crates/charmer-monitor/src/components/job_detail.rs index 690b128..d254518 100644 --- a/crates/charmer-monitor/src/components/job_detail.rs +++ b/crates/charmer-monitor/src/components/job_detail.rs @@ -1,47 +1,835 @@ -//! Job detail panel. +//! Job detail panel with rich formatting. 
+use charmer_state::{EnvType, ExecutionEnvironment, FailureMode, Job, JobStatus, PipelineState}; +use chrono::Utc; use ratatui::{ layout::Rect, + style::{Color, Modifier, Style}, + text::{Line, Span}, widgets::{Block, Borders, Paragraph}, Frame, }; -use charmer_state::Job; + +/// Color palette for wildcard values (matches job_list.rs). +const WILDCARD_COLORS: [Color; 6] = [ + Color::Cyan, + Color::Magenta, + Color::Yellow, + Color::Green, + Color::Blue, + Color::Red, +]; pub struct JobDetail; impl JobDetail { - pub fn render(frame: &mut Frame, area: Rect, job: Option<&Job>) { + pub fn render(frame: &mut Frame, area: Rect, job: Option<&Job>, command_expanded: bool) { let content = match job { - Some(job) => { - let mut lines = vec![ - format!("Rule: {}", job.rule), - format!("Status: {:?}", job.status), - ]; - - if let Some(ref slurm_id) = job.slurm_job_id { - lines.push(format!("SLURM Job: {}", slurm_id)); + Some(job) => build_detail_lines(job, command_expanded), + None => vec![Line::from(Span::styled( + "No job selected", + Style::default().fg(Color::DarkGray), + ))], + }; + + let paragraph = Paragraph::new(content).block( + Block::default() + .borders(Borders::ALL) + .title(" Job Details "), + ); + + frame.render_widget(paragraph, area); + } + + /// Render pipeline summary when main snakemake job is selected. + pub fn render_pipeline(frame: &mut Frame, area: Rect, state: &PipelineState) { + let content = build_pipeline_lines(state); + + let paragraph = Paragraph::new(content).block( + Block::default() + .borders(Borders::ALL) + .title(" Job Details "), + ); + + frame.render_widget(paragraph, area); + } +} + +/// Build detail lines for pipeline summary. 
+fn build_pipeline_lines(state: &PipelineState) -> Vec> { + let mut lines = Vec::new(); + let counts = state.job_counts(); + + // Title + lines.push(Line::from(vec![Span::styled( + "Snakemake Pipeline", + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + )])); + + lines.push(Line::from("")); + + // Status + let (status_text, status_color) = if state.pipeline_finished { + ("Completed", Color::Green) + } else if !state.pipeline_errors.is_empty() { + ("Failed", Color::Red) + } else { + ("Running", Color::Yellow) + }; + + lines.push(Line::from(vec![ + Span::styled("Status: ", Style::default().fg(Color::Gray)), + Span::styled( + status_text, + Style::default() + .fg(status_color) + .add_modifier(Modifier::BOLD), + ), + ])); + + // Host + if let Some(ref host) = state.host { + lines.push(Line::from(vec![ + Span::styled("Host: ", Style::default().fg(Color::Gray)), + Span::styled(host.clone(), Style::default().fg(Color::White)), + ])); + } + + // Cores + if let Some(cores) = state.cores { + lines.push(Line::from(vec![ + Span::styled("Cores: ", Style::default().fg(Color::Gray)), + Span::styled(cores.to_string(), Style::default().fg(Color::White)), + ])); + } + + lines.push(Line::from("")); + + // Progress section + lines.push(Line::from(Span::styled( + "Progress", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + // Total jobs + if let Some(total) = state.total_jobs { + lines.push(Line::from(vec![ + Span::styled(" Total: ", Style::default().fg(Color::Gray)), + Span::styled(total.to_string(), Style::default().fg(Color::White)), + Span::styled(" jobs", Style::default().fg(Color::Gray)), + ])); + + // Progress percentage + let progress = counts.completed as f64 / total as f64 * 100.0; + lines.push(Line::from(vec![ + Span::styled(" Progress: ", Style::default().fg(Color::Gray)), + Span::styled( + format!("{:.0}%", progress), + Style::default().fg(Color::Green), + ), + Span::styled( + format!(" 
({}/{})", counts.completed, total), + Style::default().fg(Color::Gray), + ), + ])); + } + + // Job breakdown + lines.push(Line::from(vec![ + Span::styled(" Running: ", Style::default().fg(Color::Gray)), + Span::styled( + counts.running.to_string(), + Style::default().fg(Color::Yellow), + ), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Completed: ", Style::default().fg(Color::Gray)), + Span::styled( + counts.completed.to_string(), + Style::default().fg(Color::Green), + ), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Failed: ", Style::default().fg(Color::Gray)), + Span::styled( + counts.failed.to_string(), + Style::default().fg(if counts.failed > 0 { + Color::Red + } else { + Color::Gray + }), + ), + ])); + + lines.push(Line::from(vec![ + Span::styled(" Pending: ", Style::default().fg(Color::Gray)), + Span::styled( + (counts.pending + counts.queued).to_string(), + Style::default().fg(Color::Blue), + ), + ])); + + // Errors section + if !state.pipeline_errors.is_empty() { + lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Errors", + Style::default() + .fg(Color::Red) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + for error in state.pipeline_errors.iter().take(3) { + // Error type with icon + let label = format!("{} {}", error.icon(), error.label()); + let mut spans = vec![ + Span::styled(" ", Style::default()), + Span::styled( + label, + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + ]; + + // Add rule name if available + if let Some(ref rule) = error.rule { + spans.push(Span::styled( + format!(" ({})", rule), + Style::default().fg(Color::Yellow), + )); + } + + // Add exit code if available + if let Some(code) = error.exit_code { + spans.push(Span::styled( + format!(" [exit {}]", code), + Style::default().fg(Color::Gray), + )); + } + + lines.push(Line::from(spans)); + + // Show first detail if available + if let Some(detail) = error.details.first() { + let msg = if detail.len() > 42 { 
+ format!("...{}", &detail[detail.len() - 39..]) + } else { + detail.clone() + }; + lines.push(Line::from(vec![ + Span::styled(" ", Style::default()), + Span::styled(msg, Style::default().fg(Color::Gray)), + ])); + } + } + + if state.pipeline_errors.len() > 3 { + lines.push(Line::from(Span::styled( + format!(" (+{} more)", state.pipeline_errors.len() - 3), + Style::default().fg(Color::Gray), + ))); + } + } + + lines +} + +fn build_detail_lines(job: &Job, command_expanded: bool) -> Vec> { + let mut lines = Vec::new(); + + // Rule name with color + lines.push(Line::from(vec![ + Span::styled("Rule: ", Style::default().fg(Color::Gray)), + Span::styled( + job.rule.clone(), + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + ), + ])); + + // Wildcards / Sample info - colored to match job list + if let Some(ref wildcards) = job.wildcards { + let mut spans = vec![Span::styled( + "Wildcards: ", + Style::default().fg(Color::Gray), + )]; + + // Parse and color each wildcard: key in white, value in color + let pairs: Vec<(&str, &str)> = wildcards + .split(',') + .filter_map(|part| { + part.trim() + .split_once('=') + .map(|(k, v)| (k.trim(), v.trim())) + }) + .collect(); + + for (i, (key, value)) in pairs.iter().enumerate() { + if i > 0 { + spans.push(Span::styled(", ", Style::default().fg(Color::DarkGray))); + } + // Key in white + spans.push(Span::styled( + format!("{}=", key), + Style::default().fg(Color::White), + )); + // Value in color + let color = WILDCARD_COLORS[i % WILDCARD_COLORS.len()]; + spans.push(Span::styled(value.to_string(), Style::default().fg(color))); + } + + lines.push(Line::from(spans)); + } else { + // Try to extract sample from output path + if let Some(sample) = + extract_sample_from_path(&job.outputs.first().cloned().unwrap_or_default()) + { + lines.push(Line::from(vec![ + Span::styled("Sample: ", Style::default().fg(Color::Gray)), + Span::styled(sample, Style::default().fg(Color::Yellow)), + ])); + } + } + + 
lines.push(Line::from("")); + + // Status with appropriate color + let (status_text, status_color) = match job.status { + JobStatus::Running => ("Running", Color::Yellow), + JobStatus::Completed => ("Completed", Color::Green), + JobStatus::Failed => ("Failed", Color::Red), + JobStatus::Queued => ("Queued", Color::Blue), + JobStatus::Pending => ("Pending", Color::White), + JobStatus::Cancelled => ("Cancelled", Color::Magenta), + JobStatus::Unknown => ("Unknown", Color::DarkGray), + }; + lines.push(Line::from(vec![ + Span::styled("Status: ", Style::default().fg(Color::Gray)), + Span::styled( + format!("{} {}", job.status.symbol(), status_text), + Style::default() + .fg(status_color) + .add_modifier(Modifier::BOLD), + ), + ])); + + // SLURM/LSF Job ID + if let Some(ref slurm_id) = job.slurm_job_id { + lines.push(Line::from(vec![ + Span::styled("Job ID: ", Style::default().fg(Color::Gray)), + Span::styled(slurm_id.clone(), Style::default().fg(Color::Cyan)), + ])); + } + + // Execution environment + let env = ExecutionEnvironment::detect( + &job.shellcmd, + job.conda_env.as_deref(), + job.container_img_url.as_deref(), + ); + if env.env_type != EnvType::Direct { + let (env_label, env_color) = match env.env_type { + EnvType::Pixi => ("Pixi", Color::Magenta), + EnvType::Conda => ("Conda", Color::Green), + EnvType::Container => ("Container", Color::Blue), + EnvType::Direct => ("Direct", Color::Gray), + }; + let env_name = env.env_name.or(env.image_url).unwrap_or_default(); + lines.push(Line::from(vec![ + Span::styled("Env: ", Style::default().fg(Color::Gray)), + Span::styled( + env_label, + Style::default().fg(env_color).add_modifier(Modifier::BOLD), + ), + if !env_name.is_empty() { + Span::styled(format!(" ({})", env_name), Style::default().fg(Color::Gray)) + } else { + Span::raw("") + }, + ])); + } + + lines.push(Line::from("")); + + // Resources section + lines.push(Line::from(Span::styled( + "Resources", + Style::default() + .fg(Color::White) + 
.add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + // Partition/Queue + if let Some(ref partition) = job.resources.partition { + lines.push(Line::from(vec![ + Span::styled(" Queue: ", Style::default().fg(Color::Gray)), + Span::styled(partition.clone(), Style::default().fg(Color::Magenta)), + ])); + } + + // Node + if let Some(ref node) = job.resources.node { + lines.push(Line::from(vec![ + Span::styled(" Node: ", Style::default().fg(Color::Gray)), + Span::styled(node.clone(), Style::default().fg(Color::Cyan)), + ])); + } + + // CPUs + if let Some(cpus) = job.resources.cpus { + lines.push(Line::from(vec![ + Span::styled(" CPUs: ", Style::default().fg(Color::Gray)), + Span::styled(cpus.to_string(), Style::default().fg(Color::Green)), + ])); + } + + // Memory + if let Some(mem) = job.resources.memory_mb { + let mem_str = if mem >= 1024 { + format!("{:.1} GB", mem as f64 / 1024.0) + } else { + format!("{} MB", mem) + }; + lines.push(Line::from(vec![ + Span::styled(" Memory: ", Style::default().fg(Color::Gray)), + Span::styled(mem_str, Style::default().fg(Color::Green)), + ])); + } + + // Time limit + if let Some(ref time_limit) = job.resources.time_limit { + lines.push(Line::from(vec![ + Span::styled(" Time Limit: ", Style::default().fg(Color::Gray)), + Span::styled( + format_duration(time_limit), + Style::default().fg(Color::Yellow), + ), + ])); + } + + // Usage section (actual consumption for finished jobs) + if let Some(ref usage) = job.usage { + lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Actual Usage", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + // Max RSS (actual memory used) + if let Some(max_rss) = usage.max_rss_mb { + let mem_str = if max_rss >= 1024 { + format!("{:.1} GB", max_rss as f64 / 1024.0) + } else { + format!("{} MB", max_rss) + }; + // Compare with requested + let efficiency = job.resources.memory_mb.map(|req| { + if req > 0 { + (max_rss as f64 / req 
as f64 * 100.0) as u32 + } else { + 0 } + }); + let eff_color = match efficiency { + Some(e) if e > 90 => Color::Red, // Near limit + Some(e) if e > 70 => Color::Yellow, // Good utilization + Some(e) if e > 30 => Color::Green, // Moderate + _ => Color::Cyan, // Low utilization + }; + let mut spans = vec![ + Span::styled(" Memory: ", Style::default().fg(Color::Gray)), + Span::styled(mem_str, Style::default().fg(eff_color)), + ]; + if let Some(eff) = efficiency { + spans.push(Span::styled( + format!(" ({}%)", eff), + Style::default().fg(Color::Gray), + )); + } + lines.push(Line::from(spans)); + } - if let Some(ref node) = job.resources.node { - lines.push(format!("Node: {}", node)); + // Elapsed time + if let Some(elapsed) = usage.elapsed_seconds { + let time_str = format_seconds(elapsed); + // Compare with time limit + let efficiency = job.resources.time_limit.map(|limit| { + let limit_secs = limit.as_secs(); + if limit_secs > 0 { + (elapsed as f64 / limit_secs as f64 * 100.0) as u32 + } else { + 0 } + }); + let mut spans = vec![ + Span::styled(" Runtime: ", Style::default().fg(Color::Gray)), + Span::styled(time_str, Style::default().fg(Color::Green)), + ]; + if let Some(eff) = efficiency { + spans.push(Span::styled( + format!(" ({}%)", eff), + Style::default().fg(Color::Gray), + )); + } + lines.push(Line::from(spans)); + } + + // CPU time + if let Some(cpu_time) = usage.cpu_time_seconds { + let time_str = format_seconds(cpu_time); + lines.push(Line::from(vec![ + Span::styled(" CPU Time: ", Style::default().fg(Color::Gray)), + Span::styled(time_str, Style::default().fg(Color::Cyan)), + ])); + } + } + + lines.push(Line::from("")); + + // Timing section + lines.push(Line::from(Span::styled( + "Timing", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + // Wait time (queued to started) + if let (Some(queued), Some(started)) = (job.timing.queued_at, job.timing.started_at) { + let wait = started - queued; + 
lines.push(Line::from(vec![ + Span::styled(" Wait: ", Style::default().fg(Color::Gray)), + Span::styled( + format_chrono_duration(&wait), + Style::default().fg(Color::Blue), + ), + ])); + } + + // Runtime + if let Some(started) = job.timing.started_at { + let runtime = if let Some(completed) = job.timing.completed_at { + completed - started + } else { + Utc::now() - started + }; + let runtime_color = if job.status == JobStatus::Running { + Color::Yellow + } else { + Color::Green + }; + lines.push(Line::from(vec![ + Span::styled(" Runtime: ", Style::default().fg(Color::Gray)), + Span::styled( + format_chrono_duration_hms(&runtime), + Style::default().fg(runtime_color), + ), + ])); + } + + // Started at + if let Some(started) = job.timing.started_at { + lines.push(Line::from(vec![ + Span::styled(" Started: ", Style::default().fg(Color::Gray)), + Span::styled( + started.format("%Y-%m-%d %H:%M:%S").to_string(), + Style::default().fg(Color::White), + ), + ])); + } + + // Error section (if failed) + if let Some(ref error) = job.error { + lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Error", + Style::default() + .fg(Color::Red) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); - if let Some(cpus) = job.resources.cpus { - lines.push(format!("CPUs: {}", cpus)); + // Show failure analysis if available + if let Some(ref analysis) = error.analysis { + // Failure mode with icon and color + let (mode_icon, mode_text, mode_color) = match analysis.mode { + FailureMode::OutOfMemory => ("⚠", "Out of Memory", Color::Red), + FailureMode::Timeout => ("⏱", "Timeout", Color::Yellow), + FailureMode::ExitCode => ("✗", "Exit Code Error", Color::Red), + FailureMode::Cancelled => ("⊘", "Cancelled", Color::Magenta), + FailureMode::NodeFailure => ("⚡", "Node Failure", Color::LightRed), + FailureMode::Unknown => ("?", "Unknown", Color::Gray), + }; + lines.push(Line::from(vec![ + Span::styled(" Failure: ", Style::default().fg(Color::Gray)), + Span::styled( + 
format!("{} {}", mode_icon, mode_text), + Style::default().fg(mode_color).add_modifier(Modifier::BOLD), + ), + ])); + + // Memory details for OOM + if analysis.mode == FailureMode::OutOfMemory { + if let (Some(used), Some(limit)) = + (analysis.memory_used_mb, analysis.memory_limit_mb) + { + lines.push(Line::from(vec![ + Span::styled(" Memory: ", Style::default().fg(Color::Gray)), + Span::styled( + format!("{:.1} GB", used as f64 / 1024.0), + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + Span::styled(" / ", Style::default().fg(Color::Gray)), + Span::styled( + format!("{:.1} GB limit", limit as f64 / 1024.0), + Style::default().fg(Color::Gray), + ), + ])); } + } - if let Some(mem) = job.resources.memory_mb { - lines.push(format!("Memory: {} MB", mem)); + // Time details for Timeout + if analysis.mode == FailureMode::Timeout { + if let (Some(runtime), Some(limit)) = + (analysis.runtime_seconds, analysis.time_limit_seconds) + { + lines.push(Line::from(vec![ + Span::styled(" Time: ", Style::default().fg(Color::Gray)), + Span::styled( + format_seconds(runtime), + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD), + ), + Span::styled(" / ", Style::default().fg(Color::Gray)), + Span::styled( + format!("{} limit", format_seconds(limit)), + Style::default().fg(Color::Gray), + ), + ])); } + } - lines.join("\n") + // Explanation + if !analysis.explanation.is_empty() { + // Wrap long explanations + let explanation = if analysis.explanation.len() > 45 { + format!("{}...", &analysis.explanation[..42]) + } else { + analysis.explanation.clone() + }; + lines.push(Line::from(vec![ + Span::styled(" ", Style::default()), + Span::styled(explanation, Style::default().fg(Color::White)), + ])); } - None => "No job selected".to_string(), - }; - let paragraph = Paragraph::new(content) - .block(Block::default().borders(Borders::ALL).title("Details")); + // Suggestion (highlighted) + if !analysis.suggestion.is_empty() { + lines.push(Line::from("")); + 
lines.push(Line::from(Span::styled( + "Suggestion", + Style::default() + .fg(Color::Green) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + // Handle multi-line suggestions + for line in analysis.suggestion.lines().take(3) { + let suggestion_line = if line.len() > 45 { + format!("{}...", &line[..42]) + } else { + line.to_string() + }; + lines.push(Line::from(vec![ + Span::styled(" ", Style::default()), + Span::styled(suggestion_line, Style::default().fg(Color::Green)), + ])); + } + } + } else { + // No analysis available - show basic error info + lines.push(Line::from(vec![ + Span::styled(" Exit Code: ", Style::default().fg(Color::Gray)), + Span::styled( + error.exit_code.to_string(), + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + ])); + if !error.message.is_empty() { + // Truncate long error messages + let msg = if error.message.len() > 50 { + format!("{}...", &error.message[..47]) + } else { + error.message.clone() + }; + lines.push(Line::from(vec![ + Span::styled(" Message: ", Style::default().fg(Color::Gray)), + Span::styled(msg, Style::default().fg(Color::Red)), + ])); + } + } + } - frame.render_widget(paragraph, area); + // Output files + if !job.outputs.is_empty() { + lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Output", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + for output in job.outputs.iter().take(3) { + let display = if output.len() > 40 { + format!("...{}", &output[output.len() - 37..]) + } else { + output.clone() + }; + lines.push(Line::from(vec![ + Span::styled(" ", Style::default()), + Span::styled(display, Style::default().fg(Color::Gray)), + ])); + } + if job.outputs.len() > 3 { + lines.push(Line::from(Span::styled( + format!(" (+{} more)", job.outputs.len() - 3), + Style::default().fg(Color::Gray), + ))); + } + } + + // Shell command preview + let trimmed_cmd = job.shellcmd.trim(); + if !trimmed_cmd.is_empty() { + 
lines.push(Line::from("")); + lines.push(Line::from(Span::styled( + "Command", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD | Modifier::UNDERLINED), + ))); + + let all_cmd_lines: Vec<&str> = trimmed_cmd + .lines() + .map(|l| l.trim()) + .filter(|l| !l.is_empty()) + .collect(); + + let total_lines = all_cmd_lines.len(); + + if command_expanded { + // Show all lines, no truncation + for cmd_line in &all_cmd_lines { + lines.push(Line::from(vec![ + Span::styled(" ", Style::default()), + Span::styled(cmd_line.to_string(), Style::default().fg(Color::Gray)), + ])); + } + // Show hint to collapse/copy + lines.push(Line::from(Span::styled( + " ('e' to collapse, 'c' to copy)", + Style::default().fg(Color::DarkGray), + ))); + } else { + // Show first 3 lines with truncation + for cmd_line in all_cmd_lines.iter().take(3) { + let display = if cmd_line.len() > 50 { + format!("{}…", &cmd_line[..49]) + } else { + cmd_line.to_string() + }; + lines.push(Line::from(vec![ + Span::styled(" ", Style::default()), + Span::styled(display, Style::default().fg(Color::Gray)), + ])); + } + + // Indicate if there are more lines + if total_lines > 3 { + lines.push(Line::from(Span::styled( + format!(" (+{} more lines, press 'e' to expand)", total_lines - 3), + Style::default().fg(Color::DarkGray), + ))); + } + } + } + + lines +} + +/// Extract sample name from output path patterns like "results/processed/sample1.txt" +fn extract_sample_from_path(path: &str) -> Option { + // Common patterns: look for sample names between slashes + let parts: Vec<&str> = path.split('/').collect(); + if parts.len() >= 2 { + // Get the filename without extension + if let Some(filename) = parts.last() { + let name = filename.split('.').next().unwrap_or(filename); + // Check if it looks like a sample name (not a generic name) + if !name.is_empty() && name != "output" && name != "result" { + return Some(name.to_string()); + } + } + } + None +} + +fn format_duration(d: &std::time::Duration) -> 
String {
    // Identical zero-padded HH:MM:SS / MM:SS rendering as format_seconds;
    // delegate instead of duplicating the arithmetic three times over.
    format_seconds(d.as_secs())
}

/// Format a chrono duration as zero-padded "HH:MM:SS", or "MM:SS" when the
/// duration is under one hour. Negative durations use their absolute value.
fn format_chrono_duration(d: &chrono::Duration) -> String {
    format_seconds(d.num_seconds().unsigned_abs())
}

/// Format a chrono duration in compact human form: "1h 2m 3s", "2m 3s", or
/// "3s", dropping leading zero units. Negative durations use absolute value.
fn format_chrono_duration_hms(d: &chrono::Duration) -> String {
    let secs = d.num_seconds().unsigned_abs();
    let hours = secs / 3600;
    let mins = (secs % 3600) / 60;
    let secs = secs % 60;

    if hours > 0 {
        format!("{}h {}m {}s", hours, mins, secs)
    } else if mins > 0 {
        format!("{}m {}s", mins, secs)
    } else {
        format!("{}s", secs)
    }
}

/// Format a raw second count as zero-padded "HH:MM:SS", or "MM:SS" when under
/// one hour. Shared by the duration formatters above.
fn format_seconds(secs: u64) -> String {
    let hours = secs / 3600;
    let mins = (secs % 3600) / 60;
    let secs = secs % 60;

    if hours > 0 {
        format!("{:02}:{:02}:{:02}", hours, mins, secs)
    } else {
        format!("{:02}:{:02}", mins, secs)
    }
}
diff --git a/crates/charmer-monitor/src/components/job_list.rs b/crates/charmer-monitor/src/components/job_list.rs
index dc4af7c..cbc90e7 100644
--- a/crates/charmer-monitor/src/components/job_list.rs
+++ b/crates/charmer-monitor/src/components/job_list.rs
@@ -1,44 +1,859 @@
-//! Job list component.
+//! Job list component with progress indicator and dependency visualization.
+use crate::app::ViewMode; +use crate::components::ViewTabs; +use charmer_state::{Job, JobCounts, JobStatus, PipelineState, MAIN_PIPELINE_JOB_ID}; use ratatui::{ - layout::Rect, - style::{Color, Style}, - widgets::{Block, Borders, List, ListItem}, + layout::{Constraint, Direction, Layout, Rect}, + style::{Color, Modifier, Style}, + text::{Line, Span}, + widgets::{ + Block, Borders, Gauge, List, ListItem, ListState, Paragraph, Scrollbar, + ScrollbarOrientation, ScrollbarState, + }, Frame, }; -use charmer_state::{PipelineState, JobStatus}; +use std::collections::{HashMap, HashSet}; + +/// Minimum widths for columns +const MIN_ROW_WIDTH: u16 = 4; +const MIN_STATUS_WIDTH: u16 = 2; +const MIN_RULE_WIDTH: u16 = 12; +const MAX_RULE_WIDTH: u16 = 20; // Cap rule column to prevent excessive width +const MIN_WILDCARDS_WIDTH: u16 = 16; +const MAX_WILDCARDS_WIDTH: u16 = 30; // Give wildcards more room +const RUNTIME_WIDTH: u16 = 6; // Fixed width for runtime (e.g., "1h23m" or "45m12s") +const CHAIN_WIDTH: u16 = 3; // Fixed width for dependency chain indicator + +/// Column visibility thresholds (panel width needed to show column) +const WILDCARDS_THRESHOLD: u16 = 45; +const RUNTIME_THRESHOLD: u16 = 65; + +/// Display options for job list items +struct DisplayOptions { + content_width: u16, + show_wildcards: bool, + show_runtime: bool, +} + +/// Dependency relationship to the selected job +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DepRelation { + /// This is the selected job + Selected, + /// This job is an upstream dependency (selected depends on this) + Upstream, + /// This job is a downstream dependent (this depends on selected) + Downstream, + /// No relation to selected job + None, +} + +/// Position in the dependency chain for rendering tree connectors +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ChainPosition { + /// First node in chain (top) - uses ┐ + First, + /// Last node in chain (bottom) - uses ┘ + Last, + /// Middle node - uses ┤ + Middle, + /// Not 
a node, just trunk passing through - uses │ + Trunk, + /// Outside the chain entirely + Outside, +} + +/// Compute dependency relationships for all jobs relative to selected job. +/// Returns (relation, chain_position) for each job. +/// This finds the FULL transitive dependency chain (all ancestors and descendants). +fn compute_dependencies( + state: &PipelineState, + job_ids: &[String], + selected_idx: Option, +) -> Vec<(DepRelation, ChainPosition)> { + let mut relations = vec![(DepRelation::None, ChainPosition::Outside); job_ids.len()]; + + let Some(sel_idx) = selected_idx else { + return relations; + }; + + let Some(selected_id) = job_ids.get(sel_idx) else { + return relations; + }; + + // Skip if main pipeline job is selected + if selected_id == MAIN_PIPELINE_JOB_ID { + return relations; + } + + if !state.jobs.contains_key(selected_id) { + return relations; + } + + // Mark selected job + relations[sel_idx].0 = DepRelation::Selected; + + // Build output->job_id map for finding upstream dependencies + let mut output_to_job: HashMap<&str, &str> = HashMap::new(); + for (job_id, job) in &state.jobs { + for output in &job.outputs { + output_to_job.insert(output.as_str(), job_id.as_str()); + } + } + + // Build input->job_ids map for finding downstream dependencies + let mut input_to_jobs: HashMap<&str, Vec<&str>> = HashMap::new(); + for (job_id, job) in &state.jobs { + for input in &job.inputs { + input_to_jobs + .entry(input.as_str()) + .or_default() + .push(job_id.as_str()); + } + } + + // Find all transitive upstream dependencies (ancestors) + let mut upstream_ids: HashSet<&str> = HashSet::new(); + let mut to_visit: Vec<&str> = vec![selected_id.as_str()]; + let mut visited: HashSet<&str> = HashSet::new(); + + while let Some(current_id) = to_visit.pop() { + if visited.contains(current_id) { + continue; + } + visited.insert(current_id); + + if let Some(job) = state.jobs.get(current_id) { + for input in &job.inputs { + if let Some(&parent_id) = 
output_to_job.get(input.as_str()) { + if parent_id != selected_id.as_str() { + upstream_ids.insert(parent_id); + } + if !visited.contains(parent_id) { + to_visit.push(parent_id); + } + } + } + } + } + + // Find all transitive downstream dependencies (descendants) + let mut downstream_ids: HashSet<&str> = HashSet::new(); + let mut to_visit: Vec<&str> = vec![selected_id.as_str()]; + let mut visited: HashSet<&str> = HashSet::new(); + + while let Some(current_id) = to_visit.pop() { + if visited.contains(current_id) { + continue; + } + visited.insert(current_id); + + if let Some(job) = state.jobs.get(current_id) { + // Find jobs that consume this job's outputs + for output in &job.outputs { + if let Some(child_ids) = input_to_jobs.get(output.as_str()) { + for &child_id in child_ids { + if child_id != selected_id.as_str() { + downstream_ids.insert(child_id); + } + if !visited.contains(child_id) { + to_visit.push(child_id); + } + } + } + } + } + } + + // Mark relationships and collect chain member indices + let mut chain_indices: Vec = vec![sel_idx]; + for (idx, job_id) in job_ids.iter().enumerate() { + if idx == sel_idx { + continue; + } + if upstream_ids.contains(job_id.as_str()) { + relations[idx].0 = DepRelation::Upstream; + chain_indices.push(idx); + } else if downstream_ids.contains(job_id.as_str()) { + relations[idx].0 = DepRelation::Downstream; + chain_indices.push(idx); + } + } + + // Determine chain positions for tree rendering + if chain_indices.len() > 1 { + chain_indices.sort(); + let min_idx = chain_indices[0]; + let max_idx = chain_indices[chain_indices.len() - 1]; + + // Mark positions for all rows in the range + #[allow(clippy::needless_range_loop)] + for idx in min_idx..=max_idx { + if relations[idx].0 != DepRelation::None { + // This is an actual chain member + if idx == min_idx { + relations[idx].1 = ChainPosition::First; + } else if idx == max_idx { + relations[idx].1 = ChainPosition::Last; + } else { + relations[idx].1 = ChainPosition::Middle; + } + 
} else { + // Trunk passes through + relations[idx].1 = ChainPosition::Trunk; + } + } + } else if chain_indices.len() == 1 { + // Only selected job, no dependencies to show + relations[sel_idx].1 = ChainPosition::Outside; + } + + relations +} pub struct JobList; impl JobList { - pub fn render(frame: &mut Frame, area: Rect, state: &PipelineState, selected: Option) { - let items: Vec = state - .jobs - .values() - .enumerate() - .map(|(i, job)| { - let style = match job.status { - JobStatus::Running => Style::default().fg(Color::Yellow), - JobStatus::Completed => Style::default().fg(Color::Green), - JobStatus::Failed => Style::default().fg(Color::Red), - _ => Style::default(), - }; + /// Render the job list using filtered job IDs. + pub fn render( + frame: &mut Frame, + area: Rect, + state: &PipelineState, + filtered_job_ids: &[String], + selected: Option, + filter_label: &str, + sort_label: &str, + ) { + let counts = state.job_counts(); + + // Calculate visible job count (exclude main pipeline pseudo-job) + let visible = filtered_job_ids + .iter() + .filter(|id| id.as_str() != MAIN_PIPELINE_JOB_ID) + .count(); + + // Split area: progress bar on top, column headers, list below + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Length(2), // Progress header + Constraint::Length(1), // Column headers + Constraint::Min(1), // Job list + ]) + .split(area); + + // Render progress header + render_progress_header( + frame, + chunks[0], + &counts, + visible, + state.total_jobs, + filter_label, + sort_label, + ); + + // Calculate available width for content (minus borders) + let content_width = chunks[1].width.saturating_sub(2); + + // Determine which columns to show based on width + let opts = DisplayOptions { + content_width, + show_wildcards: content_width >= WILDCARDS_THRESHOLD, + show_runtime: content_width >= RUNTIME_THRESHOLD, + }; - let wildcards = job.wildcards.as_deref().unwrap_or(""); - let label = if 
wildcards.is_empty() { - job.rule.clone() + // Render column headers + render_column_headers(frame, chunks[1], &opts); + + // Compute dependency relationships for visual indicator + let deps = compute_dependencies(state, filtered_job_ids, selected); + + // Build job list items with responsive columns + // Track display row number separately (main pipeline job doesn't get a number) + let mut display_row = 0usize; + let items: Vec = filtered_job_ids + .iter() + .enumerate() + .map(|(i, job_id)| { + let row_num = if job_id == MAIN_PIPELINE_JOB_ID { + 0 // Main pipeline uses special display, row num not shown } else { - format!("{}[{}]", job.rule, wildcards) + display_row += 1; + display_row }; - - ListItem::new(format!("{} {}", job.status.symbol(), label)).style(style) + let (relation, chain_pos) = deps[i]; + build_job_item( + row_num, i, job_id, state, &counts, selected, &opts, relation, chain_pos, + ) }) .collect(); let list = List::new(items) - .block(Block::default().borders(Borders::ALL).title("Jobs")) - .highlight_style(Style::default().bg(Color::DarkGray)); + .block(Block::default().borders(Borders::LEFT | Borders::RIGHT | Borders::BOTTOM)); + + let mut list_state = ListState::default(); + list_state.select(selected); + + frame.render_stateful_widget(list, chunks[2], &mut list_state); + + // Render scrollbar if there are more items than visible + let list_height = chunks[2].height.saturating_sub(2) as usize; // minus borders + if filtered_job_ids.len() > list_height { + let mut scrollbar_state = ScrollbarState::new(filtered_job_ids.len()) + .position(selected.unwrap_or(0)) + .viewport_content_length(list_height); + + let scrollbar = Scrollbar::new(ScrollbarOrientation::VerticalRight) + .begin_symbol(Some("↑")) + .end_symbol(Some("↓")) + .track_symbol(Some("│")) + .thumb_symbol("█"); + + // Use full panel area with 2-space inset from top and bottom + let scrollbar_area = Rect { + x: area.x, + y: area.y + 2, + width: area.width, + height: 
area.height.saturating_sub(4), // 2 top + 2 bottom + }; + + frame.render_stateful_widget(scrollbar, scrollbar_area, &mut scrollbar_state); + } + } +} + +/// Build a single job list item with responsive columns. +#[allow(clippy::too_many_arguments)] +fn build_job_item( + row_num: usize, + list_index: usize, + job_id: &str, + state: &PipelineState, + counts: &JobCounts, + selected: Option, + opts: &DisplayOptions, + dep_relation: DepRelation, + chain_pos: ChainPosition, +) -> ListItem<'static> { + // Handle main pipeline job specially + if job_id == MAIN_PIPELINE_JOB_ID { + return build_main_pipeline_item(state, counts, selected == Some(list_index)); + } + + // Regular job + let Some(job) = state.jobs.get(job_id) else { + return ListItem::new(Line::from(Span::raw("???"))); + }; + + let is_selected = selected == Some(list_index); + let status_style = get_status_style(job.status); + + // Extract wildcards for colored display + let wildcards = extract_wildcards(job); + + // Calculate column widths + // Layout: # | Status | Rule | Wildcards | Runtime | Chain + // Chain is always at far right (fixed 3 chars) + let fixed_width = MIN_ROW_WIDTH + MIN_STATUS_WIDTH + CHAIN_WIDTH; + let mut remaining = opts.content_width.saturating_sub(fixed_width); + + // Reserve fixed width for runtime (rightmost before chain) + let runtime_width = if opts.show_runtime { + remaining = remaining.saturating_sub(RUNTIME_WIDTH + 1); // +1 for separator + RUNTIME_WIDTH + } else { + 0 + }; + + // Reserve space for wildcards column (generous width) + let wildcards_width = if opts.show_wildcards { + let w = remaining + .saturating_sub(MAX_RULE_WIDTH + 1) // leave room for rule + .clamp(MIN_WILDCARDS_WIDTH, MAX_WILDCARDS_WIDTH); + remaining = remaining.saturating_sub(w + 1); // +1 for separator + w + } else { + 0 + }; + + // Rule gets remaining space, capped at MAX_RULE_WIDTH + let rule_width = remaining.clamp(MIN_RULE_WIDTH, MAX_RULE_WIDTH) as usize; + + // Build spans + let mut spans = Vec::new(); + 
+ // Row number (highlighted when selected) + let row_style = if is_selected { + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD) + } else { + Style::default().fg(Color::Gray) + }; + spans.push(Span::styled(format!("{:3} ", row_num), row_style)); + + // Status symbol (highlighted when selected) + // Use 🎯 for target rules (like "all"), otherwise use status symbol + let status_symbol = if job.is_target { + "🎯" + } else { + job.status.symbol() + }; + let status_display_style = if is_selected { + status_style.add_modifier(Modifier::BOLD) + } else { + status_style + }; + spans.push(Span::styled( + format!("{} ", status_symbol), + status_display_style, + )); + + // Rule name (takes available space, truncates if needed) + let rule_display = truncate_str(&job.rule, rule_width); + let rule_style = if is_selected { + status_style.add_modifier(Modifier::BOLD) + } else { + status_style + }; + spans.push(Span::styled( + format!("{: = Vec::new(); + let mut total_len = 0usize; + let max_len = wildcards_width as usize; + + for (i, value) in wildcards.iter().enumerate() { + if i > 0 { + // Add pipe separator + if total_len < max_len { + wildcard_spans.push(Span::styled("|", sep_style)); + total_len += 1; + } else { + break; + } + } + + // Get color for this wildcard (cycle through palette) + let base_color = WILDCARD_COLORS[i % WILDCARD_COLORS.len()]; + let style = if is_selected { + Style::default().fg(base_color).add_modifier(Modifier::BOLD) + } else { + Style::default().fg(base_color) + }; + + // Truncate value if needed + let remaining_space = max_len.saturating_sub(total_len); + if remaining_space == 0 { + break; + } + let display_value = if value.len() <= remaining_space { + value.clone() + } else if remaining_space > 1 { + format!("{}…", &value[..remaining_space - 1]) + } else { + "…".to_string() + }; + total_len += display_value.len(); + wildcard_spans.push(Span::styled(display_value, style)); + } + + // Pad to column width + let padding = 
max_len.saturating_sub(total_len); + if padding > 0 { + wildcard_spans.push(Span::raw(" ".repeat(padding))); + } + + spans.extend(wildcard_spans); + } + + // Runtime column (fixed width, right-aligned) + if opts.show_runtime { + let sep_style = if is_selected { + Style::default().fg(Color::Gray) + } else { + Style::default().fg(Color::DarkGray) + }; + spans.push(Span::styled(" │ ", sep_style)); + + let runtime = get_job_runtime(job); + let runtime_style = if is_selected { + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD) + } else { + Style::default().fg(Color::Yellow) + }; + spans.push(Span::styled( + format!("{:>width$}", runtime, width = runtime_width as usize), + runtime_style, + )); + } + + // Dependency tree indicator (always at far right, fixed width) + // Format: ○─┐ (first), ○─┤ (middle), ○─┘ (last), or just │ (trunk) + // Dot style: ○ pending, ● completed, ◐ running + let tree_style = Style::default().fg(Color::White); + + // Choose dot based on job status + let status_dot = match job.status { + JobStatus::Running => "◐", + JobStatus::Completed => "●", + _ => "○", // Pending, Queued, Failed, etc. 
+ }; + + let dep_indicator: Vec = match chain_pos { + ChainPosition::First => { + let dot_style = match dep_relation { + DepRelation::Upstream => Style::default().fg(Color::Cyan), + DepRelation::Selected => Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + DepRelation::Downstream => Style::default().fg(Color::Magenta), + DepRelation::None => tree_style, + }; + vec![ + Span::styled(status_dot, dot_style), + Span::styled("─┐", tree_style), + ] + } + ChainPosition::Middle => { + let dot_style = match dep_relation { + DepRelation::Upstream => Style::default().fg(Color::Cyan), + DepRelation::Selected => Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + DepRelation::Downstream => Style::default().fg(Color::Magenta), + DepRelation::None => tree_style, + }; + vec![ + Span::styled(status_dot, dot_style), + Span::styled("─┤", tree_style), + ] + } + ChainPosition::Last => { + let dot_style = match dep_relation { + DepRelation::Upstream => Style::default().fg(Color::Cyan), + DepRelation::Selected => Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + DepRelation::Downstream => Style::default().fg(Color::Magenta), + DepRelation::None => tree_style, + }; + vec![ + Span::styled(status_dot, dot_style), + Span::styled("─┘", tree_style), + ] + } + ChainPosition::Trunk => { + vec![Span::styled(" │", tree_style)] + } + ChainPosition::Outside => { + vec![Span::raw(" ")] + } + }; + spans.extend(dep_indicator); + + ListItem::new(Line::from(spans)) +} + +/// Get runtime string for a job. 
+fn get_job_runtime(job: &Job) -> String { + use chrono::Utc; + + if let Some(started) = job.timing.started_at { + let elapsed = if let Some(completed) = job.timing.completed_at { + completed - started + } else { + Utc::now() - started + }; + + let secs = elapsed.num_seconds().unsigned_abs(); + let mins = secs / 60; + let secs = secs % 60; + + if mins >= 60 { + let hours = mins / 60; + let mins = mins % 60; + format!("{}h{}m", hours, mins) + } else if mins > 0 { + format!("{}m{}s", mins, secs) + } else { + format!("{}s", secs) + } + } else { + "-".to_string() + } +} + +/// Build the main pipeline job item. +fn build_main_pipeline_item( + state: &PipelineState, + counts: &JobCounts, + is_selected: bool, +) -> ListItem<'static> { + let status_symbol = if state.pipeline_finished { + "✓" + } else if !state.pipeline_errors.is_empty() { + "✗" + } else { + "▶" + }; + + let status_color = if state.pipeline_finished { + Color::Green + } else if !state.pipeline_errors.is_empty() { + Color::Red + } else { + Color::Cyan + }; + + let label = if let Some(total) = state.total_jobs { + format!("snakemake ({}/{})", counts.completed, total) + } else { + "snakemake (main log)".to_string() + }; + + let mut item_style = Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD); + if is_selected { + item_style = item_style.add_modifier(Modifier::REVERSED); + } + + ListItem::new(Line::from(vec![ + Span::styled(" - ", Style::default().fg(Color::DarkGray)), + Span::styled( + format!("{} ", status_symbol), + Style::default().fg(status_color), + ), + Span::styled(label, item_style), + ])) +} + +/// Get the style for a job status. 
+fn get_status_style(status: JobStatus) -> Style { + match status { + JobStatus::Running => Style::default().fg(Color::Yellow), + JobStatus::Completed => Style::default().fg(Color::Green), + JobStatus::Failed => Style::default().fg(Color::Red), + JobStatus::Queued => Style::default().fg(Color::Blue), + JobStatus::Pending => Style::default().fg(Color::White), + JobStatus::Cancelled => Style::default().fg(Color::Magenta), + JobStatus::Unknown => Style::default().fg(Color::DarkGray), + } +} + +/// Extract wildcards as separate values for colored display. +fn extract_wildcards(job: &Job) -> Vec { + let Some(wildcards) = &job.wildcards else { + return Vec::new(); + }; + + // Parse wildcards like "sample=sample1, chrom=chr1" + // Return each value separately for colored rendering + wildcards + .split(',') + .filter_map(|part| { + part.trim() + .split_once('=') + .map(|(_, value)| value.trim().to_string()) + }) + .collect() +} + +/// Color palette for wildcard values. +const WILDCARD_COLORS: [Color; 6] = [ + Color::Cyan, + Color::Magenta, + Color::Yellow, + Color::Green, + Color::Blue, + Color::Red, +]; + +/// Truncate a string to fit within a given width. +fn truncate_str(s: &str, max_width: usize) -> String { + if s.len() <= max_width { + s.to_string() + } else if max_width <= 1 { + "…".to_string() + } else { + format!("{}…", &s[..max_width - 1]) + } +} + +/// Render column headers for the job list. 
+fn render_column_headers(frame: &mut Frame, area: Rect, opts: &DisplayOptions) { + let header_style = Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD); + let sep_style = Style::default().fg(Color::DarkGray); + + // Calculate column widths (same logic as build_job_item) + let fixed_width = MIN_ROW_WIDTH + MIN_STATUS_WIDTH + CHAIN_WIDTH; + let mut remaining = opts.content_width.saturating_sub(fixed_width); + + let runtime_width = if opts.show_runtime { + remaining = remaining.saturating_sub(RUNTIME_WIDTH + 1); + RUNTIME_WIDTH + } else { + 0 + }; + + let wildcards_width = if opts.show_wildcards { + let w = remaining + .saturating_sub(MAX_RULE_WIDTH + 1) + .clamp(MIN_WILDCARDS_WIDTH, MAX_WILDCARDS_WIDTH); + remaining = remaining.saturating_sub(w + 1); + w + } else { + 0 + }; + + let rule_width = remaining.clamp(MIN_RULE_WIDTH, MAX_RULE_WIDTH) as usize; + + // Build header spans + let mut spans = Vec::new(); + + // Row number column header + spans.push(Span::styled(" # ", header_style)); + + // Status column header (just a symbol placeholder) + spans.push(Span::styled("○ ", header_style)); + + // Rule column header + spans.push(Span::styled( + format!("{: 0 { + spans.push(Span::raw(" ".repeat(padding))); + } + } + + // Runtime column header + if opts.show_runtime { + spans.push(Span::styled(" │ ", sep_style)); + spans.push(Span::styled( + format!("{:>width$}", "Time", width = runtime_width as usize), + header_style, + )); } + + // Chain column - just space (no header text) + spans.push(Span::raw(" ")); + + let header_line = Line::from(spans); + let paragraph = + Paragraph::new(header_line).block(Block::default().borders(Borders::LEFT | Borders::RIGHT)); + + frame.render_widget(paragraph, area); +} + +/// Render a progress header with inline progress bar. 
+fn render_progress_header( + frame: &mut Frame, + area: Rect, + counts: &JobCounts, + _visible: usize, + total_jobs: Option, + filter_label: &str, + sort_label: &str, +) { + // Prefer total_jobs from snakemake log (more accurate) over counted jobs + let total = total_jobs.unwrap_or(counts.total); + + // Use tabs as title + let tabs_title = ViewTabs::title_line(ViewMode::Jobs); + + let block = Block::default() + .borders(Borders::TOP | Borders::LEFT | Borders::RIGHT) + .title(tabs_title); + + // Calculate inner area for the gauge layout + let inner = block.inner(area); + frame.render_widget(block, area); + + // Layout: Filter/Sort | Gauge | (count) + // Calculate width for filter/sort section + let filter_sort_width = 8 + filter_label.len() + 7 + sort_label.len() + 2; // "Filter:" + label + " Sort:" + label + padding + let count_text = format!("({}/{})", counts.completed, total); + + let chunks = Layout::default() + .direction(Direction::Horizontal) + .constraints([ + Constraint::Length(filter_sort_width as u16), // Filter/Sort with padding + Constraint::Min(1), // Gauge fills remaining + Constraint::Length(count_text.len() as u16 + 1), // +1 for padding + ]) + .split(inner); + + // Filter/Sort label on left with colored values + let filter_sort = Paragraph::new(Line::from(vec![ + Span::styled(" Filter:", Style::default().fg(Color::DarkGray)), + Span::styled(filter_label, Style::default().fg(Color::Cyan)), + Span::styled(" Sort:", Style::default().fg(Color::DarkGray)), + Span::styled( + format!("{} ", sort_label), + Style::default().fg(Color::Yellow), + ), + ])); + frame.render_widget(filter_sort, chunks[0]); + + // Gauge in middle + let ratio = if total > 0 { + counts.completed as f64 / total as f64 + } else { + 0.0 + }; + + let gauge = Gauge::default() + .gauge_style(Style::default().fg(Color::Green).bg(Color::DarkGray)) + .ratio(ratio.min(1.0)); + frame.render_widget(gauge, chunks[1]); + + // Count on right + let count = Paragraph::new(Line::from(Span::styled( + 
format!("{} ", count_text), + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ))); + frame.render_widget(count, chunks[2]); } diff --git a/crates/charmer-monitor/src/components/log_viewer.rs b/crates/charmer-monitor/src/components/log_viewer.rs new file mode 100644 index 0000000..ff72026 --- /dev/null +++ b/crates/charmer-monitor/src/components/log_viewer.rs @@ -0,0 +1,279 @@ +//! Log viewer component for displaying job log files. + +use ratatui::{ + layout::Rect, + style::{Color, Style}, + text::{Line, Span}, + widgets::{Block, Borders, Paragraph, Scrollbar, ScrollbarOrientation, ScrollbarState, Wrap}, + Frame, +}; +use std::fs; +use std::io; +use std::path::Path; + +/// State for the log viewer component. +#[derive(Debug, Clone)] +pub struct LogViewerState { + /// Path to the log file being viewed + pub log_path: String, + /// Content lines from the log file + pub lines: Vec, + /// Current scroll offset (0-indexed line number) + pub scroll_offset: usize, + /// Follow mode - auto-scroll to end + pub follow_mode: bool, + /// Error message if log couldn't be loaded + pub error: Option, +} + +impl LogViewerState { + /// Create a new log viewer state by loading the specified log file. + pub fn new(log_path: String, tail_lines: usize) -> Self { + let (lines, error) = match load_log_file(&log_path, tail_lines) { + Ok(lines) => (lines, None), + Err(e) => (vec![], Some(format!("Error loading log: {}", e))), + }; + + let scroll_offset = if !lines.is_empty() && tail_lines > 0 { + lines.len().saturating_sub(1) + } else { + 0 + }; + + Self { + log_path, + lines, + scroll_offset, + follow_mode: false, + error, + } + } + + /// Scroll down by one line. + pub fn scroll_down(&mut self) { + if !self.lines.is_empty() { + self.scroll_offset = (self.scroll_offset + 1).min(self.lines.len().saturating_sub(1)); + } + self.follow_mode = false; + } + + /// Scroll up by one line. 
+ pub fn scroll_up(&mut self) { + self.scroll_offset = self.scroll_offset.saturating_sub(1); + self.follow_mode = false; + } + + /// Scroll to the top. + pub fn scroll_to_top(&mut self) { + self.scroll_offset = 0; + self.follow_mode = false; + } + + /// Scroll to the bottom. + pub fn scroll_to_bottom(&mut self) { + if !self.lines.is_empty() { + self.scroll_offset = self.lines.len().saturating_sub(1); + } + self.follow_mode = false; + } + + /// Toggle follow mode. + pub fn toggle_follow(&mut self) { + self.follow_mode = !self.follow_mode; + if self.follow_mode { + self.scroll_to_bottom(); + } + } + + /// Get the visible lines for the given viewport height. + pub fn visible_lines(&self, viewport_height: usize) -> &[String] { + if self.lines.is_empty() { + return &[]; + } + + let start = self.scroll_offset; + let end = (start + viewport_height).min(self.lines.len()); + &self.lines[start..end] + } + + /// Get scroll position information. + pub fn scroll_info(&self) -> String { + if self.lines.is_empty() { + return "0/0".to_string(); + } + format!("{}/{}", self.scroll_offset + 1, self.lines.len()) + } +} + +/// Load log file contents, optionally tailing the last N lines. +fn load_log_file(path: &str, tail_lines: usize) -> io::Result> { + let path = Path::new(path); + + if !path.exists() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("Log file not found: {}", path.display()), + )); + } + + let content = fs::read_to_string(path)?; + let lines: Vec = content.lines().map(String::from).collect(); + + // If tail_lines is specified and we have more lines, take the last N + if tail_lines > 0 && lines.len() > tail_lines { + let skip_count = lines.len() - tail_lines; + Ok(lines.into_iter().skip(skip_count).collect()) + } else { + Ok(lines) + } +} + +/// Log viewer component. +pub struct LogViewer; + +impl LogViewer { + /// Render the log viewer component. 
+ pub fn render(frame: &mut Frame, area: Rect, state: &LogViewerState) { + // Calculate content area (excluding borders) + let content_height = area.height.saturating_sub(3); // Borders + footer + + // Build the title with the log file path + let title = format!(" Log: {} ", state.log_path); + + // Get visible lines + let visible_lines = state.visible_lines(content_height as usize); + + // Build content + let content = if let Some(ref error) = state.error { + // Show error message + vec![Line::from(vec![Span::styled( + error.clone(), + Style::default().fg(Color::Red), + )])] + } else if visible_lines.is_empty() { + // Show empty message + vec![Line::from(vec![Span::styled( + "Log file is empty", + Style::default().fg(Color::DarkGray), + )])] + } else { + // Show log lines + visible_lines + .iter() + .map(|line| Line::from(vec![Span::raw(line.as_str())])) + .collect() + }; + + // Build footer with scroll info and follow indicator + let scroll_info = state.scroll_info(); + let follow_indicator = if state.follow_mode { " [follow]" } else { "" }; + let footer_text = format!(" {}{} ", scroll_info, follow_indicator); + + // Create the paragraph with borders + let block = Block::default() + .borders(Borders::ALL) + .title(title) + .title_bottom(footer_text); + + let paragraph = Paragraph::new(content) + .block(block) + .wrap(Wrap { trim: false }); + + frame.render_widget(paragraph, area); + + // Render scrollbar if content exceeds viewport + if state.lines.len() > content_height as usize { + let mut scrollbar_state = + ScrollbarState::new(state.lines.len()).position(state.scroll_offset); + + let scrollbar = Scrollbar::new(ScrollbarOrientation::VerticalRight) + .begin_symbol(Some("↑")) + .end_symbol(Some("↓")) + .track_symbol(Some("│")) + .thumb_symbol("█"); + + frame.render_stateful_widget(scrollbar, area, &mut scrollbar_state); + } + } + + /// Render the log viewer footer with keybindings. 
+ pub fn render_footer(frame: &mut Frame, area: Rect) { + let help = "j/k:scroll g/G:top/bottom F:follow q/Esc:close"; + let paragraph = Paragraph::new(help).style(Style::default().fg(Color::DarkGray)); + frame.render_widget(paragraph, area); + } + + /// Render the log viewer as a bottom panel showing tailed output. + pub fn render_panel(frame: &mut Frame, area: Rect, state: &LogViewerState) { + // Calculate content area (excluding borders) + let content_height = area.height.saturating_sub(2) as usize; // Top border + title + + // Build the title with log path and follow indicator + let follow_indicator = if state.follow_mode { " [follow]" } else { "" }; + let title = format!(" Logs: {}{} ", state.log_path, follow_indicator); + + // Get the last N lines to show (tail view) + let tail_lines = if state.lines.is_empty() { + &[][..] + } else { + let start = state.lines.len().saturating_sub(content_height); + &state.lines[start..] + }; + + // Build content + let content: Vec = if let Some(ref error) = state.error { + // Show error message + vec![Line::from(vec![Span::styled( + error.clone(), + Style::default().fg(Color::Red), + )])] + } else if tail_lines.is_empty() { + // Show empty message + vec![Line::from(vec![Span::styled( + "(waiting for log output...)", + Style::default().fg(Color::DarkGray), + )])] + } else { + // Show tailed log lines with syntax highlighting for common patterns + tail_lines + .iter() + .map(|line| { + let style = if line.contains("ERROR") || line.contains("Error") { + Style::default().fg(Color::Red) + } else if line.contains("WARN") || line.contains("Warning") { + Style::default().fg(Color::Yellow) + } else if line.contains("INFO") || line.contains("rule ") { + Style::default().fg(Color::Cyan) + } else { + Style::default().fg(Color::White) + }; + Line::from(vec![Span::styled(line.as_str(), style)]) + }) + .collect() + }; + + // Create the block with border + let block = Block::default() + .borders(Borders::ALL) + .title(title) + 
.title_style(Style::default().fg(Color::Cyan)); + + let paragraph = Paragraph::new(content).block(block); + + frame.render_widget(paragraph, area); + + // Render scrollbar for panel if content exceeds viewport + if state.lines.len() > content_height { + let scroll_pos = state.lines.len().saturating_sub(content_height); + let mut scrollbar_state = ScrollbarState::new(state.lines.len()).position(scroll_pos); + + let scrollbar = Scrollbar::new(ScrollbarOrientation::VerticalRight) + .begin_symbol(Some("↑")) + .end_symbol(Some("↓")) + .track_symbol(Some("│")) + .thumb_symbol("█"); + + frame.render_stateful_widget(scrollbar, area, &mut scrollbar_state); + } + } +} diff --git a/crates/charmer-monitor/src/components/mod.rs b/crates/charmer-monitor/src/components/mod.rs index 817e1c3..1d382be 100644 --- a/crates/charmer-monitor/src/components/mod.rs +++ b/crates/charmer-monitor/src/components/mod.rs @@ -1,11 +1,17 @@ //! TUI components. +pub mod footer; pub mod header; -pub mod job_list; pub mod job_detail; -pub mod footer; +pub mod job_list; +pub mod log_viewer; +pub mod rule_summary; +pub mod view_tabs; +pub use footer::Footer; pub use header::Header; -pub use job_list::JobList; pub use job_detail::JobDetail; -pub use footer::Footer; +pub use job_list::JobList; +pub use log_viewer::{LogViewer, LogViewerState}; +pub use rule_summary::RuleSummary; +pub use view_tabs::ViewTabs; diff --git a/crates/charmer-monitor/src/components/rule_summary.rs b/crates/charmer-monitor/src/components/rule_summary.rs new file mode 100644 index 0000000..a0135b5 --- /dev/null +++ b/crates/charmer-monitor/src/components/rule_summary.rs @@ -0,0 +1,241 @@ +//! Rule summary component showing aggregated statistics per rule. 
+ +use crate::app::ViewMode; +use crate::components::ViewTabs; +use charmer_state::{JobStatus, PipelineState}; +use ratatui::{ + layout::{Constraint, Rect}, + style::{Color, Modifier, Style}, + text::Span, + widgets::{ + Block, Borders, Row, Scrollbar, ScrollbarOrientation, ScrollbarState, Table, TableState, + }, + Frame, +}; + +/// Statistics for a single rule. +#[derive(Debug, Default)] +pub struct RuleStats { + pub total: usize, + pub running: usize, + pub completed: usize, + pub failed: usize, + pub pending: usize, + pub total_runtime_secs: u64, +} + +impl RuleStats { + /// Calculate average runtime in seconds. + pub fn avg_runtime_secs(&self) -> Option { + if self.completed > 0 { + Some(self.total_runtime_secs / self.completed as u64) + } else { + None + } + } +} + +pub struct RuleSummary; + +impl RuleSummary { + /// Render the rule summary table. + pub fn render( + frame: &mut Frame, + area: Rect, + state: &PipelineState, + rule_names: &[String], + selected: Option, + ) { + // Calculate stats for each rule + let stats: Vec<(&String, RuleStats)> = rule_names + .iter() + .map(|rule| { + let job_ids = state.jobs_by_rule.get(rule); + let mut stats = RuleStats::default(); + + if let Some(ids) = job_ids { + for id in ids { + if let Some(job) = state.jobs.get(id) { + stats.total += 1; + match job.status { + JobStatus::Running => stats.running += 1, + JobStatus::Completed => { + stats.completed += 1; + // Calculate runtime + if let (Some(start), Some(end)) = + (job.timing.started_at, job.timing.completed_at) + { + let runtime = (end - start).num_seconds().max(0) as u64; + stats.total_runtime_secs += runtime; + } + } + JobStatus::Failed => stats.failed += 1, + JobStatus::Pending | JobStatus::Queued => stats.pending += 1, + _ => {} + } + } + } + } + (rule, stats) + }) + .collect(); + + // Build table rows + let rows: Vec = stats + .iter() + .enumerate() + .map(|(i, (rule, s))| { + let is_selected = selected == Some(i); + let base_style = if is_selected { + 
Style::default().add_modifier(Modifier::BOLD) + } else { + Style::default() + }; + + // Format average runtime + let avg_time = match s.avg_runtime_secs() { + Some(secs) => format_duration(secs), + None => "-".to_string(), + }; + + // Progress bar for this rule + let progress = if s.total > 0 { + format!("{}%", s.completed * 100 / s.total) + } else { + "-".to_string() + }; + + Row::new(vec![ + Span::styled((*rule).clone(), base_style.fg(Color::Cyan)), + Span::styled(s.total.to_string(), base_style.fg(Color::White)), + Span::styled( + s.running.to_string(), + base_style.fg(if s.running > 0 { + Color::Yellow + } else { + Color::Gray + }), + ), + Span::styled( + s.completed.to_string(), + base_style.fg(if s.completed > 0 { + Color::Green + } else { + Color::Gray + }), + ), + Span::styled( + s.failed.to_string(), + base_style.fg(if s.failed > 0 { + Color::Red + } else { + Color::Gray + }), + ), + Span::styled(avg_time, base_style.fg(Color::Yellow)), + Span::styled(progress, base_style.fg(Color::White)), + ]) + }) + .collect(); + + // Build header + let header = Row::new(vec![ + Span::styled( + "Rule", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + "Total", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + "Run", + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + "Done", + Style::default() + .fg(Color::Green) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + "Fail", + Style::default().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + Span::styled( + "Avg Time", + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD), + ), + Span::styled( + "Progress", + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ), + ]) + .style(Style::default().add_modifier(Modifier::UNDERLINED)); + + // Use tabs as title + let title = ViewTabs::title_line(ViewMode::Rules); + + let table = Table::new( + rows, + [ + 
Constraint::Min(15), // Rule + Constraint::Length(6), // Total + Constraint::Length(5), // Running + Constraint::Length(5), // Done + Constraint::Length(5), // Failed + Constraint::Length(10), // Avg Time + Constraint::Length(10), // Progress + ], + ) + .header(header) + .block(Block::default().borders(Borders::ALL).title(title)) + // Text emphasis only, no background - matches job list + .row_highlight_style(Style::default().add_modifier(Modifier::BOLD)); + + let mut table_state = TableState::default(); + table_state.select(selected); + + frame.render_stateful_widget(table, area, &mut table_state); + + // Render scrollbar if needed + let table_height = area.height.saturating_sub(3) as usize; // header + borders + if rule_names.len() > table_height { + let mut scrollbar_state = + ScrollbarState::new(rule_names.len()).position(selected.unwrap_or(0)); + + let scrollbar = Scrollbar::new(ScrollbarOrientation::VerticalRight) + .begin_symbol(Some("↑")) + .end_symbol(Some("↓")) + .track_symbol(Some("│")) + .thumb_symbol("█"); + + frame.render_stateful_widget(scrollbar, area, &mut scrollbar_state); + } + } +} + +/// Format seconds as human-readable duration. +fn format_duration(secs: u64) -> String { + if secs >= 3600 { + let hours = secs / 3600; + let mins = (secs % 3600) / 60; + format!("{}h{}m", hours, mins) + } else if secs >= 60 { + let mins = secs / 60; + let secs = secs % 60; + format!("{}m{}s", mins, secs) + } else { + format!("{}s", secs) + } +} diff --git a/crates/charmer-monitor/src/components/view_tabs.rs b/crates/charmer-monitor/src/components/view_tabs.rs new file mode 100644 index 0000000..10d4320 --- /dev/null +++ b/crates/charmer-monitor/src/components/view_tabs.rs @@ -0,0 +1,46 @@ +//! View tabs component - generates title with inline tab selection. + +use crate::app::ViewMode; +use ratatui::{ + style::{Color, Modifier, Style}, + text::{Line, Span}, +}; + +pub struct ViewTabs; + +impl ViewTabs { + /// Generate a title Line with inline tab selection. 
+ /// Returns something like: " \[Jobs\] Rules " + pub fn title_line(view_mode: ViewMode) -> Line<'static> { + let tabs = [("Jobs", ViewMode::Jobs), ("Rules", ViewMode::Rules)]; + + let mut spans = Vec::new(); + spans.push(Span::raw(" ")); + + for (i, (name, mode)) in tabs.iter().enumerate() { + if i > 0 { + spans.push(Span::styled(" ", Style::default().fg(Color::DarkGray))); + } + + if *mode == view_mode { + // Selected tab - bold and highlighted + spans.push(Span::styled( + format!("[{}]", name), + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + )); + } else { + // Unselected tab - dimmed + spans.push(Span::styled( + name.to_string(), + Style::default().fg(Color::DarkGray), + )); + } + } + + spans.push(Span::raw(" ")); + + Line::from(spans) + } +} diff --git a/crates/charmer-parsers/Cargo.toml b/crates/charmer-parsers/Cargo.toml new file mode 100644 index 0000000..ecba5ab --- /dev/null +++ b/crates/charmer-parsers/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "charmer-parsers" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +authors.workspace = true +description = "Shared parsing utilities for scheduler output" + +[dependencies] +chrono.workspace = true +thiserror.workspace = true +tokio.workspace = true diff --git a/crates/charmer-parsers/src/command.rs b/crates/charmer-parsers/src/command.rs new file mode 100644 index 0000000..76fe2c2 --- /dev/null +++ b/crates/charmer-parsers/src/command.rs @@ -0,0 +1,69 @@ +//! Command execution utilities for scheduler queries. + +use thiserror::Error; +use tokio::process::Command; + +/// Error type for command execution. +#[derive(Error, Debug)] +pub enum CommandError { + #[error("Failed to execute {command}: {error}")] + Execution { command: String, error: String }, + #[error("Command {command} failed: {stderr}")] + Failed { command: String, stderr: String }, +} + +/// Execute a command and return stdout as a string. 
+/// +/// This is a convenience wrapper that handles common error cases +/// and UTF-8 conversion for scheduler command output. +pub async fn run_command(cmd: &mut Command, name: &str) -> Result { + let output = cmd.output().await.map_err(|e| CommandError::Execution { + command: name.to_string(), + error: e.to_string(), + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(CommandError::Failed { + command: name.to_string(), + stderr: stderr.to_string(), + }); + } + + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +/// Execute a command and return stdout, treating non-zero exit as OK. +/// +/// Some commands (like bjobs with no jobs) return non-zero but are still valid. +pub async fn run_command_allow_failure( + cmd: &mut Command, + name: &str, +) -> Result { + let output = cmd.output().await.map_err(|e| CommandError::Execution { + command: name.to_string(), + error: e.to_string(), + })?; + + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_run_command_success() { + let mut cmd = Command::new("echo"); + cmd.arg("hello"); + let result = run_command(&mut cmd, "echo").await.unwrap(); + assert_eq!(result.trim(), "hello"); + } + + #[tokio::test] + async fn test_run_command_not_found() { + let mut cmd = Command::new("nonexistent_command_12345"); + let result = run_command(&mut cmd, "nonexistent").await; + assert!(matches!(result, Err(CommandError::Execution { .. }))); + } +} diff --git a/crates/charmer-parsers/src/lib.rs b/crates/charmer-parsers/src/lib.rs new file mode 100644 index 0000000..82a8cf7 --- /dev/null +++ b/crates/charmer-parsers/src/lib.rs @@ -0,0 +1,59 @@ +//! Shared parsing utilities for scheduler command output. +//! +//! This crate provides common parsing functions used by both +//! charmer-slurm and charmer-lsf to reduce code duplication. 
+ +pub mod command; +pub mod memory; +pub mod time; + +pub use command::{run_command, run_command_allow_failure, CommandError}; +pub use memory::{parse_memory_mb, MemoryFormat}; +pub use time::{parse_duration, parse_exit_code, parse_lsf_timestamp, parse_slurm_timestamp}; + +/// Filter helper for optional string fields. +/// Returns None if the string is empty or a placeholder value. +pub fn non_empty_string(s: &str) -> Option { + let trimmed = s.trim(); + if trimmed.is_empty() || trimmed == "-" || trimmed == "N/A" || trimmed == "Unknown" { + None + } else { + Some(trimmed.to_string()) + } +} + +/// Split a pipe-delimited line and validate field count. +pub fn split_delimited(line: &str, min_fields: usize) -> Result, String> { + let fields: Vec<&str> = line.split('|').collect(); + if fields.len() < min_fields { + return Err(format!( + "Expected {} fields, got {}: {}", + min_fields, + fields.len(), + line + )); + } + Ok(fields) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_non_empty_string() { + assert_eq!(non_empty_string("hello"), Some("hello".to_string())); + assert_eq!(non_empty_string(" hello "), Some("hello".to_string())); + assert_eq!(non_empty_string(""), None); + assert_eq!(non_empty_string("-"), None); + assert_eq!(non_empty_string("N/A"), None); + assert_eq!(non_empty_string("Unknown"), None); + } + + #[test] + fn test_split_delimited() { + let line = "a|b|c|d"; + assert_eq!(split_delimited(line, 4).unwrap(), vec!["a", "b", "c", "d"]); + assert!(split_delimited(line, 5).is_err()); + } +} diff --git a/crates/charmer-parsers/src/memory.rs b/crates/charmer-parsers/src/memory.rs new file mode 100644 index 0000000..b932013 --- /dev/null +++ b/crates/charmer-parsers/src/memory.rs @@ -0,0 +1,108 @@ +//! Memory parsing utilities for scheduler output. + +/// Memory format variants for different schedulers. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MemoryFormat { + /// SLURM format: "4G", "1000M", "4096K" (no spaces) + Slurm, + /// SLURM sacct format: "4Gn", "1000Mc" (with per-node/per-core suffix) + SlurmSacct, + /// LSF format: "4 GB", "1000 MB" (with spaces) + Lsf, +} + +/// Parse memory string to megabytes. +/// +/// Handles various formats from SLURM and LSF: +/// - SLURM: "4G", "1000M", "4096K", "4096" (no spaces) +/// - SLURM sacct: "4Gn", "1000Mc" (n=per node, c=per core) +/// - LSF: "4 GB", "1000 MB" (with spaces) +/// +/// Returns None for empty strings or placeholder values. +pub fn parse_memory_mb(s: &str, format: MemoryFormat) -> Option { + if s.is_empty() || s == "-" { + return None; + } + + match format { + MemoryFormat::Slurm => parse_slurm_memory(s), + MemoryFormat::SlurmSacct => parse_slurm_sacct_memory(s), + MemoryFormat::Lsf => parse_lsf_memory(s), + } +} + +/// Parse SLURM squeue memory format (e.g., "4G", "1000M", "4096"). +fn parse_slurm_memory(s: &str) -> Option { + let s = s.trim(); + + if let Some(stripped) = s.strip_suffix('G') { + stripped.parse::().ok().map(|v| v * 1024) + } else if let Some(stripped) = s.strip_suffix('M') { + stripped.parse::().ok() + } else if let Some(stripped) = s.strip_suffix('K') { + stripped.parse::().ok().map(|v| v / 1024) + } else { + // Assume MB if no suffix + s.parse::().ok() + } +} + +/// Parse SLURM sacct memory format (e.g., "4Gn", "1000Mc"). +fn parse_slurm_sacct_memory(s: &str) -> Option { + // sacct memory can have 'n' or 'c' suffix (per node/per core) + let s = s.trim().trim_end_matches('n').trim_end_matches('c'); + parse_slurm_memory(s) +} + +/// Parse LSF memory format (e.g., "4 GB", "1000 MB"). 
+fn parse_lsf_memory(s: &str) -> Option { + let parts: Vec<&str> = s.split_whitespace().collect(); + if parts.is_empty() { + return None; + } + + let value: f64 = parts[0].parse().ok()?; + let unit = parts.get(1).map(|s| s.to_uppercase()).unwrap_or_default(); + + match unit.as_str() { + "GB" | "G" => Some((value * 1024.0) as u64), + "MB" | "M" | "" => Some(value as u64), + "KB" | "K" => Some((value / 1024.0) as u64), + _ => Some(value as u64), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_slurm_memory() { + assert_eq!(parse_memory_mb("4G", MemoryFormat::Slurm), Some(4096)); + assert_eq!(parse_memory_mb("1000M", MemoryFormat::Slurm), Some(1000)); + assert_eq!(parse_memory_mb("4096K", MemoryFormat::Slurm), Some(4)); + assert_eq!(parse_memory_mb("4096", MemoryFormat::Slurm), Some(4096)); + assert_eq!(parse_memory_mb("", MemoryFormat::Slurm), None); + } + + #[test] + fn test_parse_slurm_sacct_memory() { + assert_eq!(parse_memory_mb("4Gn", MemoryFormat::SlurmSacct), Some(4096)); + assert_eq!( + parse_memory_mb("1000Mc", MemoryFormat::SlurmSacct), + Some(1000) + ); + assert_eq!( + parse_memory_mb("4096", MemoryFormat::SlurmSacct), + Some(4096) + ); + } + + #[test] + fn test_parse_lsf_memory() { + assert_eq!(parse_memory_mb("4 GB", MemoryFormat::Lsf), Some(4096)); + assert_eq!(parse_memory_mb("1000 MB", MemoryFormat::Lsf), Some(1000)); + assert_eq!(parse_memory_mb("1000", MemoryFormat::Lsf), Some(1000)); + assert_eq!(parse_memory_mb("-", MemoryFormat::Lsf), None); + } +} diff --git a/crates/charmer-parsers/src/time.rs b/crates/charmer-parsers/src/time.rs new file mode 100644 index 0000000..54605b5 --- /dev/null +++ b/crates/charmer-parsers/src/time.rs @@ -0,0 +1,139 @@ +//! Time parsing utilities for scheduler output. + +use chrono::{DateTime, Datelike, NaiveDateTime, TimeZone, Utc}; +use std::time::Duration; + +/// Parse a SLURM timestamp (YYYY-MM-DDTHH:MM:SS or placeholder values). 
+/// +/// Returns None for empty strings or placeholder values like "N/A", "Unknown", "None". +pub fn parse_slurm_timestamp(s: &str) -> Option> { + if s.is_empty() || s == "N/A" || s == "Unknown" || s == "None" { + return None; + } + NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") + .ok() + .and_then(|dt| Utc.from_local_datetime(&dt).single()) +} + +/// Parse an LSF timestamp format (Mon DD HH:MM or Mon DD HH:MM YYYY). +/// +/// Returns None for empty strings or "-" placeholder. +pub fn parse_lsf_timestamp(s: &str) -> Option> { + if s.is_empty() || s == "-" { + return None; + } + + let current_year = Utc::now().year(); + + // Try with year first (e.g., "Dec 18 10:30 2024") + if let Ok(dt) = NaiveDateTime::parse_from_str(s, "%b %d %H:%M %Y") { + return Utc.from_local_datetime(&dt).single(); + } + + // Try without year, assume current year (e.g., "Dec 18 10:30") + if let Ok(dt) = + NaiveDateTime::parse_from_str(&format!("{} {}", s, current_year), "%b %d %H:%M %Y") + { + return Utc.from_local_datetime(&dt).single(); + } + + None +} + +/// Parse a duration in various formats. +/// +/// Supports: +/// - D-HH:MM:SS (SLURM time limit with days) +/// - HH:MM:SS +/// - MM:SS +/// - Seconds as integer +/// +/// Returns None for "UNLIMITED" or empty strings. 
+pub fn parse_duration(s: &str) -> Option { + if s.is_empty() || s == "UNLIMITED" || s == "-" { + return None; + } + + // Check for day separator (D-HH:MM:SS) + let parts: Vec<&str> = s.split('-').collect(); + let (days, time_part) = if parts.len() == 2 { + (parts[0].parse::().unwrap_or(0), parts[1]) + } else { + (0, parts[0]) + }; + + let time_parts: Vec = time_part + .split(':') + .filter_map(|p| p.parse().ok()) + .collect(); + + let seconds = match time_parts.len() { + 3 => time_parts[0] * 3600 + time_parts[1] * 60 + time_parts[2], + 2 => time_parts[0] * 60 + time_parts[1], + 1 => time_parts[0], + _ => return None, + }; + + Some(Duration::from_secs(days * 86400 + seconds)) +} + +/// Parse exit code from SLURM format (exit_code:signal). +/// +/// Returns the exit code portion, defaulting to 0 if parsing fails. +pub fn parse_exit_code(s: &str) -> i32 { + s.split(':') + .next() + .and_then(|v| v.parse().ok()) + .unwrap_or(0) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_slurm_timestamp() { + let dt = parse_slurm_timestamp("2024-01-15T10:30:00").unwrap(); + assert_eq!(dt.format("%Y-%m-%d").to_string(), "2024-01-15"); + + assert!(parse_slurm_timestamp("N/A").is_none()); + assert!(parse_slurm_timestamp("Unknown").is_none()); + assert!(parse_slurm_timestamp("None").is_none()); + assert!(parse_slurm_timestamp("").is_none()); + } + + #[test] + fn test_parse_lsf_timestamp() { + // With year + let dt = parse_lsf_timestamp("Dec 18 10:30 2024").unwrap(); + assert_eq!(dt.format("%Y-%m-%d").to_string(), "2024-12-18"); + + // Without year (uses current year) + let dt = parse_lsf_timestamp("Dec 18 10:30"); + assert!(dt.is_some()); + + assert!(parse_lsf_timestamp("-").is_none()); + assert!(parse_lsf_timestamp("").is_none()); + } + + #[test] + fn test_parse_duration() { + assert_eq!(parse_duration("1:00:00"), Some(Duration::from_secs(3600))); + assert_eq!( + parse_duration("1-00:00:00"), + Some(Duration::from_secs(86400)) + ); + 
assert_eq!(parse_duration("30:00"), Some(Duration::from_secs(1800))); + assert_eq!(parse_duration("3600"), Some(Duration::from_secs(3600))); + assert!(parse_duration("UNLIMITED").is_none()); + assert!(parse_duration("-").is_none()); + } + + #[test] + fn test_parse_exit_code() { + assert_eq!(parse_exit_code("0:0"), 0); + assert_eq!(parse_exit_code("1:0"), 1); + assert_eq!(parse_exit_code("137:9"), 137); + assert_eq!(parse_exit_code(""), 0); + } +} diff --git a/crates/charmer-slurm/Cargo.toml b/crates/charmer-slurm/Cargo.toml index 5178096..e4b9c5f 100644 --- a/crates/charmer-slurm/Cargo.toml +++ b/crates/charmer-slurm/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +charmer-parsers.workspace = true tokio.workspace = true regex.workspace = true thiserror.workspace = true diff --git a/crates/charmer-slurm/src/failure.rs b/crates/charmer-slurm/src/failure.rs new file mode 100644 index 0000000..d103dee --- /dev/null +++ b/crates/charmer-slurm/src/failure.rs @@ -0,0 +1,398 @@ +//! SLURM job failure analysis. +//! +//! Query detailed failure information and provide actionable suggestions. + +use charmer_parsers::{parse_memory_mb, run_command_allow_failure, MemoryFormat}; +use thiserror::Error; +use tokio::process::Command; + +#[derive(Error, Debug)] +pub enum FailureError { + #[error("Failed to execute sacct: {0}")] + ExecutionError(String), + #[error("Job not found: {0}")] + NotFound(String), + #[error("Parse error: {0}")] + ParseError(String), +} + +/// Failure mode classification. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FailureMode { + /// Job ran out of memory + OutOfMemory { + used_mb: u64, + requested_mb: u64, + suggested_mb: u64, + }, + /// Job exceeded time limit + Timeout { + elapsed_seconds: u64, + limit_seconds: u64, + suggested_seconds: u64, + }, + /// Job failed with non-zero exit code + ExitCode { code: i32, signal: Option }, + /// Job was cancelled by user or admin + Cancelled { by_user: Option }, + /// Node failure + NodeFailure { node: Option }, + /// Unknown failure mode + Unknown { state: String }, +} + +/// Detailed failure analysis result. +#[derive(Debug, Clone)] +pub struct FailureAnalysis { + /// SLURM job ID + pub job_id: String, + /// Classified failure mode + pub mode: FailureMode, + /// Human-readable explanation + pub explanation: String, + /// Suggested fix + pub suggestion: String, + /// Raw SLURM state string + pub raw_state: String, + /// Actual memory used (MB) + pub max_rss_mb: Option, + /// Requested memory (MB) + pub req_mem_mb: Option, + /// Actual runtime (seconds) + pub elapsed_seconds: Option, + /// Time limit (seconds) + pub time_limit_seconds: Option, +} + +impl FailureAnalysis { + /// Generate explanation and suggestion based on failure mode. + fn generate_messages(mode: &FailureMode) -> (String, String) { + match mode { + FailureMode::OutOfMemory { + used_mb, + requested_mb, + suggested_mb, + } => { + let explanation = format!( + "Job exceeded memory limit. Used {:.1} GB but only {:.1} GB was allocated.", + *used_mb as f64 / 1024.0, + *requested_mb as f64 / 1024.0 + ); + let suggestion = format!( + "Increase memory to at least {:.1} GB. In your Snakefile, add:\n resources: mem_mb={}", + *suggested_mb as f64 / 1024.0, + suggested_mb + ); + (explanation, suggestion) + } + FailureMode::Timeout { + elapsed_seconds, + limit_seconds, + suggested_seconds, + } => { + let explanation = format!( + "Job exceeded time limit. 
Ran for {} but limit was {}.", + format_duration(*elapsed_seconds), + format_duration(*limit_seconds) + ); + let suggestion = format!( + "Increase time limit to at least {}. In your Snakefile, add:\n resources: runtime=\"{}\"", + format_duration(*suggested_seconds), + format_duration_slurm(*suggested_seconds) + ); + (explanation, suggestion) + } + FailureMode::ExitCode { code, signal } => { + let explanation = if let Some(sig) = signal { + match sig { + 9 => format!("Job killed with signal {} (SIGKILL). Exit code: {}", sig, code), + 11 => format!("Job crashed with signal {} (SIGSEGV - segmentation fault). Exit code: {}", sig, code), + 15 => format!("Job terminated with signal {} (SIGTERM). Exit code: {}", sig, code), + _ => format!("Job exited with code {} and signal {}", code, sig), + } + } else { + match code { + 1 => "Job failed with exit code 1 (general error)".to_string(), + 2 => "Job failed with exit code 2 (misuse of shell command)".to_string(), + 126 => "Job failed with exit code 126 (command not executable)".to_string(), + 127 => "Job failed with exit code 127 (command not found)".to_string(), + 137 => "Job killed (likely OOM killer). Exit code 137 = 128 + 9 (SIGKILL)" + .to_string(), + _ => format!("Job failed with exit code {}", code), + } + }; + let suggestion = if *code == 137 { + "This is likely an out-of-memory error. Try increasing memory allocation." 
+ .to_string() + } else { + "Check the job's stderr log for error details.".to_string() + }; + (explanation, suggestion) + } + FailureMode::Cancelled { by_user } => { + let explanation = if let Some(user) = by_user { + format!("Job was cancelled by {}", user) + } else { + "Job was cancelled".to_string() + }; + ( + "Consider if this was intentional or due to dependency failure.".to_string(), + explanation, + ) + } + FailureMode::NodeFailure { node } => { + let explanation = if let Some(n) = node { + format!("Job failed due to node {} failure", n) + } else { + "Job failed due to node failure".to_string() + }; + ( + "Re-run the job. If persistent, contact cluster admin.".to_string(), + explanation, + ) + } + FailureMode::Unknown { state } => ( + format!("Job failed with unknown state: {}", state), + "Check SLURM logs for details.".to_string(), + ), + } + } +} + +/// Query detailed failure information for a SLURM job. +pub async fn analyze_failure(job_id: &str) -> Result { + // Query sacct with detailed memory and time info + // Format: State, ExitCode, MaxRSS, ReqMem, Elapsed, Timelimit, NodeList + let mut cmd = Command::new("sacct"); + cmd.args([ + "-j", + job_id, + "-X", + "--parsable2", + "--noheader", + "--format", + "State,ExitCode,MaxRSS,ReqMem,Elapsed,Timelimit,NodeList", + ]); + + let stdout = run_command_allow_failure(&mut cmd, "sacct") + .await + .map_err(|e| FailureError::ExecutionError(e.to_string()))?; + + let line = stdout + .lines() + .next() + .ok_or_else(|| FailureError::NotFound(job_id.to_string()))?; + + parse_failure_line(job_id, line) +} + +/// Parse sacct output line for failure analysis. 
+fn parse_failure_line(job_id: &str, line: &str) -> Result { + let fields: Vec<&str> = line.split('|').collect(); + if fields.len() < 7 { + return Err(FailureError::ParseError(format!( + "Expected 7 fields, got {}: {}", + fields.len(), + line + ))); + } + + let raw_state = fields[0].to_string(); + let exit_code_str = fields[1]; + let max_rss_str = fields[2]; + let req_mem_str = fields[3]; + let elapsed_str = fields[4]; + let time_limit_str = fields[5]; + let node = if fields[6].is_empty() || fields[6] == "None" { + None + } else { + Some(fields[6].to_string()) + }; + + // Parse exit code (format: "exit_code:signal") + let (exit_code, signal) = parse_exit_code_signal(exit_code_str); + + // Parse memory values + let max_rss_mb = parse_memory_mb(max_rss_str, MemoryFormat::SlurmSacct); + let req_mem_mb = parse_memory_mb(req_mem_str, MemoryFormat::SlurmSacct); + + // Parse time values + let elapsed_seconds = parse_elapsed(elapsed_str); + let time_limit_seconds = parse_elapsed(time_limit_str); + + // Determine failure mode + let base_state = raw_state.split_whitespace().next().unwrap_or(&raw_state); + let mode = match base_state.to_uppercase().as_str() { + "OUT_OF_MEMORY" => { + let used = max_rss_mb.unwrap_or(0); + let requested = req_mem_mb.unwrap_or(0); + // Suggest 50% more than used, rounded up to nearest GB + let suggested = ((used as f64 * 1.5) / 1024.0).ceil() as u64 * 1024; + FailureMode::OutOfMemory { + used_mb: used, + requested_mb: requested, + suggested_mb: suggested.max(requested + 1024), + } + } + "TIMEOUT" => { + let elapsed = elapsed_seconds.unwrap_or(0); + let limit = time_limit_seconds.unwrap_or(0); + // Suggest 50% more time + let suggested = (elapsed as f64 * 1.5) as u64; + FailureMode::Timeout { + elapsed_seconds: elapsed, + limit_seconds: limit, + suggested_seconds: suggested.max(limit + 3600), + } + } + "CANCELLED" => { + // Check if cancelled by someone + let by_user = if raw_state.contains("by ") { + raw_state.split("by ").nth(1).map(|s| 
s.trim().to_string()) + } else { + None + }; + FailureMode::Cancelled { by_user } + } + "NODE_FAIL" => FailureMode::NodeFailure { node }, + "FAILED" | "BOOT_FAIL" | "DEADLINE" => { + // Check for common exit codes that indicate OOM + if exit_code == 137 || (signal == Some(9) && max_rss_mb.is_some()) { + let used = max_rss_mb.unwrap_or(0); + let requested = req_mem_mb.unwrap_or(0); + let suggested = ((used as f64 * 1.5) / 1024.0).ceil() as u64 * 1024; + FailureMode::OutOfMemory { + used_mb: used, + requested_mb: requested, + suggested_mb: suggested.max(requested + 1024), + } + } else { + FailureMode::ExitCode { + code: exit_code, + signal, + } + } + } + other => FailureMode::Unknown { + state: other.to_string(), + }, + }; + + let (explanation, suggestion) = FailureAnalysis::generate_messages(&mode); + + Ok(FailureAnalysis { + job_id: job_id.to_string(), + mode, + explanation, + suggestion, + raw_state, + max_rss_mb, + req_mem_mb, + elapsed_seconds, + time_limit_seconds, + }) +} + +/// Parse exit code string "code:signal" into (code, signal). +fn parse_exit_code_signal(s: &str) -> (i32, Option) { + let parts: Vec<&str> = s.split(':').collect(); + let code = parts.first().and_then(|p| p.parse().ok()).unwrap_or(0); + let signal = parts + .get(1) + .and_then(|p| p.parse().ok()) + .filter(|&s| s != 0); + (code, signal) +} + +/// Parse elapsed time string (HH:MM:SS or D-HH:MM:SS) to seconds. 
+fn parse_elapsed(s: &str) -> Option { + if s.is_empty() || s == "Unknown" { + return None; + } + + // Handle D-HH:MM:SS format + let (days, time_part) = if s.contains('-') { + let parts: Vec<&str> = s.splitn(2, '-').collect(); + let days: u64 = parts[0].parse().ok()?; + (days, parts.get(1).copied().unwrap_or("0:0:0")) + } else { + (0, s) + }; + + // Handle HH:MM:SS or MM:SS + let time_parts: Vec<&str> = time_part.split(':').collect(); + let (hours, mins, secs) = match time_parts.len() { + 3 => ( + time_parts[0].parse::().ok()?, + time_parts[1].parse::().ok()?, + time_parts[2].parse::().ok()?, + ), + 2 => ( + 0, + time_parts[0].parse::().ok()?, + time_parts[1].parse::().ok()?, + ), + 1 => (0, 0, time_parts[0].parse::().ok()?), + _ => return None, + }; + + Some(days * 86400 + hours * 3600 + mins * 60 + secs) +} + +/// Format seconds as human-readable duration. +fn format_duration(seconds: u64) -> String { + let hours = seconds / 3600; + let mins = (seconds % 3600) / 60; + let secs = seconds % 60; + + if hours > 24 { + let days = hours / 24; + let hours = hours % 24; + format!("{}d {:02}:{:02}:{:02}", days, hours, mins, secs) + } else if hours > 0 { + format!("{:02}:{:02}:{:02}", hours, mins, secs) + } else { + format!("{:02}:{:02}", mins, secs) + } +} + +/// Format seconds as SLURM duration format (D-HH:MM:SS). 
+fn format_duration_slurm(seconds: u64) -> String { + let days = seconds / 86400; + let hours = (seconds % 86400) / 3600; + let mins = (seconds % 3600) / 60; + let secs = seconds % 60; + + if days > 0 { + format!("{}-{:02}:{:02}:{:02}", days, hours, mins, secs) + } else { + format!("{:02}:{:02}:{:02}", hours, mins, secs) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_elapsed() { + assert_eq!(parse_elapsed("00:05:30"), Some(330)); + assert_eq!(parse_elapsed("01:30:00"), Some(5400)); + assert_eq!(parse_elapsed("1-12:00:00"), Some(129600)); + assert_eq!(parse_elapsed("30"), Some(30)); + } + + #[test] + fn test_parse_exit_code_signal() { + assert_eq!(parse_exit_code_signal("0:0"), (0, None)); + assert_eq!(parse_exit_code_signal("1:0"), (1, None)); + assert_eq!(parse_exit_code_signal("137:9"), (137, Some(9))); + } + + #[test] + fn test_format_duration() { + assert_eq!(format_duration(330), "05:30"); + assert_eq!(format_duration(3661), "01:01:01"); + assert_eq!(format_duration(90061), "1d 01:01:01"); + } +} diff --git a/crates/charmer-slurm/src/lib.rs b/crates/charmer-slurm/src/lib.rs index c250968..8e17812 100644 --- a/crates/charmer-slurm/src/lib.rs +++ b/crates/charmer-slurm/src/lib.rs @@ -2,8 +2,12 @@ //! //! Query job status via squeue and sacct. -pub mod types; -pub mod squeue; +pub mod failure; pub mod sacct; +pub mod squeue; +pub mod types; +pub use failure::{analyze_failure, FailureAnalysis, FailureError, FailureMode}; +pub use sacct::{query_resource_usage, query_sacct, SacctError, SlurmResourceUsage}; +pub use squeue::{query_squeue, SqueueError}; pub use types::{SlurmJob, SlurmJobState}; diff --git a/crates/charmer-slurm/src/sacct.rs b/crates/charmer-slurm/src/sacct.rs index 22e9a2c..6f87e98 100644 --- a/crates/charmer-slurm/src/sacct.rs +++ b/crates/charmer-slurm/src/sacct.rs @@ -1,8 +1,14 @@ //! Query SLURM job history via sacct. 
use crate::types::{SlurmJob, SlurmJobState}; +use charmer_parsers::{ + non_empty_string, parse_duration, parse_exit_code, parse_memory_mb, parse_slurm_timestamp, + run_command, split_delimited, MemoryFormat, +}; use chrono::{DateTime, Utc}; +use std::time::Duration; use thiserror::Error; +use tokio::process::Command; #[derive(Error, Debug)] pub enum SacctError { @@ -12,12 +18,216 @@ pub enum SacctError { ParseError(String), } +/// sacct output format (--parsable2 uses | delimiter) +/// JobIDRaw, JobName, State, Partition, Submit, Start, End, NodeList, AllocCPUS, ReqMem, Timelimit, Comment, ExitCode +const SACCT_FORMAT: &str = + "JobIDRaw,JobName,State,Partition,Submit,Start,End,NodeList,AllocCPUS,ReqMem,Timelimit,Comment,ExitCode"; + +/// Parse sacct state string with exit code info. +fn parse_state(state_str: &str, exit_code_str: &str) -> SlurmJobState { + let exit_code = parse_exit_code(exit_code_str); + + // sacct states can have suffixes like "CANCELLED by 12345" + let base_state = state_str.split_whitespace().next().unwrap_or(state_str); + + match base_state.to_uppercase().as_str() { + "PENDING" => SlurmJobState::Pending, + "RUNNING" => SlurmJobState::Running, + "COMPLETED" => SlurmJobState::Completed { + exit_code, + runtime: Duration::ZERO, // Would need to calculate from start/end + }, + "FAILED" => SlurmJobState::Failed { + exit_code, + error: format!("Exit code: {}", exit_code), + }, + "CANCELLED" => SlurmJobState::Cancelled, + "TIMEOUT" => SlurmJobState::Timeout, + "OUT_OF_MEMORY" => SlurmJobState::OutOfMemory, + "NODE_FAIL" => SlurmJobState::Failed { + exit_code: -1, + error: "Node failure".to_string(), + }, + other => SlurmJobState::Unknown(other.to_string()), + } +} + +/// Parse a single line of sacct output. 
+fn parse_sacct_line(line: &str) -> Result { + let fields = split_delimited(line, 13).map_err(SacctError::ParseError)?; + + let state = parse_state(fields[2], fields[12]); + + Ok(SlurmJob { + job_id: fields[0].to_string(), + name: fields[1].to_string(), + state, + partition: non_empty_string(fields[3]), + submit_time: parse_slurm_timestamp(fields[4]), + start_time: parse_slurm_timestamp(fields[5]), + end_time: parse_slurm_timestamp(fields[6]), + nodelist: non_empty_string(fields[7]), + cpus: fields[8].parse().ok(), + mem_mb: parse_memory_mb(fields[9], MemoryFormat::SlurmSacct), + time_limit: parse_duration(fields[10]), + comment: non_empty_string(fields[11]), + }) +} + +/// Resource usage data from sacct. +#[derive(Debug, Clone)] +pub struct SlurmResourceUsage { + pub job_id: String, + pub max_rss_mb: Option, + pub elapsed_seconds: Option, + pub cpu_time_seconds: Option, +} + +/// Query resource usage for a specific job. +pub async fn query_resource_usage(job_id: &str) -> Result, SacctError> { + let mut cmd = Command::new("sacct"); + cmd.args([ + "-j", + job_id, + "-X", + "--parsable2", + "--noheader", + "--format", + "JobIDRaw,MaxRSS,Elapsed,TotalCPU", + ]); + + let stdout = run_command(&mut cmd, "sacct") + .await + .map_err(|e| SacctError::ExecutionError(e.to_string()))?; + + let line = match stdout.lines().next() { + Some(l) if !l.trim().is_empty() => l, + _ => return Ok(None), + }; + + let fields: Vec<&str> = line.split('|').collect(); + if fields.len() < 4 { + return Ok(None); + } + + Ok(Some(SlurmResourceUsage { + job_id: fields[0].to_string(), + max_rss_mb: parse_memory_mb(fields[1], MemoryFormat::SlurmSacct), + elapsed_seconds: parse_elapsed_time(fields[2]), + cpu_time_seconds: parse_elapsed_time(fields[3]), + })) +} + +/// Parse elapsed time string (HH:MM:SS or D-HH:MM:SS) to seconds. 
/// Parse a sacct elapsed/CPU-time string to whole seconds.
///
/// Accepts "D-HH:MM:SS", "HH:MM:SS", "MM:SS", or "SS.mmm" (any
/// fractional part is discarded). Returns `None` for "", "Unknown",
/// or any unparseable component.
fn parse_elapsed_time(s: &str) -> Option<u64> {
    if s.is_empty() || s == "Unknown" {
        return None;
    }

    // Handle D-HH:MM:SS format: split off an optional leading day count.
    let (days, time_part) = if s.contains('-') {
        let parts: Vec<&str> = s.splitn(2, '-').collect();
        let days: u64 = parts[0].parse().ok()?;
        (days, parts.get(1).copied().unwrap_or("0:0:0"))
    } else {
        (0, s)
    };

    // Handle HH:MM:SS or MM:SS or SS.mmm
    let time_part = time_part.split('.').next().unwrap_or(time_part); // Remove milliseconds
    let time_parts: Vec<&str> = time_part.split(':').collect();
    let (hours, mins, secs) = match time_parts.len() {
        3 => (
            time_parts[0].parse::<u64>().ok()?,
            time_parts[1].parse::<u64>().ok()?,
            time_parts[2].parse::<u64>().ok()?,
        ),
        2 => (
            0,
            time_parts[0].parse::<u64>().ok()?,
            time_parts[1].parse::<u64>().ok()?,
        ),
        1 => (0, 0, time_parts[0].parse::<u64>().ok()?),
        _ => return None,
    };

    Some(days * 86400 + hours * 3600 + mins * 60 + secs)
}

/// Query job history with sacct.
///
/// Filters to jobs started after `since` (default: last 24 hours) and,
/// when given, to jobs whose name matches `run_uuid`.
pub async fn query_sacct(
    run_uuid: Option<&str>,
    since: Option<DateTime<Utc>>,
) -> Result<Vec<SlurmJob>, SacctError> {
- Ok(vec![]) + let mut cmd = Command::new("sacct"); + cmd.args(["-X", "--parsable2", "--noheader", "--format", SACCT_FORMAT]); + + // Add time filter + if let Some(since_time) = since { + let time_str = since_time.format("%Y-%m-%dT%H:%M:%S").to_string(); + cmd.args(["--starttime", &time_str]); + } else { + // Default to last 24 hours + cmd.args(["--starttime", "now-24hours"]); + } + + // Filter by job name if run_uuid specified + if let Some(uuid) = run_uuid { + cmd.args(["--name", uuid]); + } + + let stdout = run_command(&mut cmd, "sacct") + .await + .map_err(|e| SacctError::ExecutionError(e.to_string()))?; + + let mut jobs = Vec::new(); + + for line in stdout.lines() { + if line.trim().is_empty() { + continue; + } + match parse_sacct_line(line) { + Ok(job) => jobs.push(job), + Err(e) => eprintln!("Warning: {}", e), + } + } + + Ok(jobs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_exit_code() { + assert_eq!(parse_exit_code("0:0"), 0); + assert_eq!(parse_exit_code("1:0"), 1); + assert_eq!(parse_exit_code("137:9"), 137); + } + + #[test] + fn test_parse_state() { + assert!(matches!( + parse_state("COMPLETED", "0:0"), + SlurmJobState::Completed { exit_code: 0, .. } + )); + assert!(matches!( + parse_state("FAILED", "1:0"), + SlurmJobState::Failed { exit_code: 1, .. } + )); + assert_eq!( + parse_state("CANCELLED by 12345", "0:0"), + SlurmJobState::Cancelled + ); + } + + #[test] + fn test_parse_sacct_line() { + let line = "12345|test_job|COMPLETED|short|2024-01-15T10:00:00|2024-01-15T10:05:00|2024-01-15T10:10:00|node01|4|4Gn|1:00:00|rule_align_wildcards_sample=S1|0:0"; + let job = parse_sacct_line(line).unwrap(); + assert_eq!(job.job_id, "12345"); + assert_eq!(job.name, "test_job"); + assert!(matches!(job.state, SlurmJobState::Completed { .. 
})); + } } diff --git a/crates/charmer-slurm/src/squeue.rs b/crates/charmer-slurm/src/squeue.rs index 1a37399..0e47735 100644 --- a/crates/charmer-slurm/src/squeue.rs +++ b/crates/charmer-slurm/src/squeue.rs @@ -1,7 +1,13 @@ //! Query active SLURM jobs via squeue. use crate::types::{SlurmJob, SlurmJobState}; +use charmer_parsers::{ + non_empty_string, parse_duration, parse_memory_mb, parse_slurm_timestamp, run_command, + split_delimited, MemoryFormat, +}; +use std::time::Duration; use thiserror::Error; +use tokio::process::Command; #[derive(Error, Debug)] pub enum SqueueError { @@ -11,9 +17,142 @@ pub enum SqueueError { ParseError(String), } +/// squeue output format: +/// %A - Job ID +/// %j - Job name +/// %T - State (extended) +/// %P - Partition +/// %V - Submit time +/// %S - Start time +/// %e - End time (estimated) +/// %N - Nodelist +/// %C - CPUs +/// %m - Memory +/// %l - Time limit +/// %k - Comment +const SQUEUE_FORMAT: &str = "%A|%j|%T|%P|%V|%S|%e|%N|%C|%m|%l|%k"; + +/// Parse SLURM state string. +fn parse_state(s: &str) -> SlurmJobState { + match s.to_uppercase().as_str() { + "PENDING" | "PD" => SlurmJobState::Pending, + "RUNNING" | "R" => SlurmJobState::Running, + "COMPLETED" | "CD" => SlurmJobState::Completed { + exit_code: 0, + runtime: Duration::ZERO, + }, + "FAILED" | "F" => SlurmJobState::Failed { + exit_code: 1, + error: String::new(), + }, + "CANCELLED" | "CA" => SlurmJobState::Cancelled, + "TIMEOUT" | "TO" => SlurmJobState::Timeout, + "OUT_OF_MEMORY" | "OOM" => SlurmJobState::OutOfMemory, + other => SlurmJobState::Unknown(other.to_string()), + } +} + +/// Parse a single line of squeue output. 
+fn parse_squeue_line(line: &str) -> Result { + let fields = split_delimited(line, 12).map_err(SqueueError::ParseError)?; + + Ok(SlurmJob { + job_id: fields[0].to_string(), + name: fields[1].to_string(), + state: parse_state(fields[2]), + partition: non_empty_string(fields[3]), + submit_time: parse_slurm_timestamp(fields[4]), + start_time: parse_slurm_timestamp(fields[5]), + end_time: parse_slurm_timestamp(fields[6]), + nodelist: non_empty_string(fields[7]), + cpus: fields[8].parse().ok(), + mem_mb: parse_memory_mb(fields[9], MemoryFormat::Slurm), + time_limit: parse_duration(fields[10]), + comment: non_empty_string(fields[11]), + }) +} + /// Query active jobs with squeue. -pub async fn query_squeue(_run_uuid: Option<&str>) -> Result, SqueueError> { - // TODO: Implement squeue parsing - // squeue -u $USER -h -o "%A|%j|%T|%P|%V|%S|%e|%N|%C|%m|%l|%k" - Ok(vec![]) +pub async fn query_squeue(run_uuid: Option<&str>) -> Result, SqueueError> { + let user = std::env::var("USER").unwrap_or_default(); + + let mut cmd = Command::new("squeue"); + cmd.args(["-u", &user, "-h", "-o", SQUEUE_FORMAT]); + + // If run_uuid specified, filter by job name + if let Some(uuid) = run_uuid { + cmd.args(["--name", uuid]); + } + + let stdout = run_command(&mut cmd, "squeue") + .await + .map_err(|e| SqueueError::ExecutionError(e.to_string()))?; + + let mut jobs = Vec::new(); + + for line in stdout.lines() { + if line.trim().is_empty() { + continue; + } + match parse_squeue_line(line) { + Ok(job) => jobs.push(job), + Err(e) => eprintln!("Warning: {}", e), + } + } + + Ok(jobs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_slurm_time() { + let dt = parse_slurm_timestamp("2024-01-15T10:30:00").unwrap(); + assert_eq!(dt.format("%Y-%m-%d").to_string(), "2024-01-15"); + + assert!(parse_slurm_timestamp("N/A").is_none()); + assert!(parse_slurm_timestamp("").is_none()); + } + + #[test] + fn test_parse_time_limit() { + assert_eq!(parse_duration("1:00:00"), 
Some(Duration::from_secs(3600))); + assert_eq!( + parse_duration("1-00:00:00"), + Some(Duration::from_secs(86400)) + ); + assert_eq!(parse_duration("30:00"), Some(Duration::from_secs(1800))); + assert!(parse_duration("UNLIMITED").is_none()); + } + + #[test] + fn test_parse_memory() { + assert_eq!(parse_memory_mb("4G", MemoryFormat::Slurm), Some(4096)); + assert_eq!(parse_memory_mb("1000M", MemoryFormat::Slurm), Some(1000)); + assert_eq!(parse_memory_mb("4096", MemoryFormat::Slurm), Some(4096)); + } + + #[test] + fn test_parse_state() { + assert_eq!(parse_state("RUNNING"), SlurmJobState::Running); + assert_eq!(parse_state("R"), SlurmJobState::Running); + assert_eq!(parse_state("PENDING"), SlurmJobState::Pending); + assert_eq!(parse_state("PD"), SlurmJobState::Pending); + } + + #[test] + fn test_parse_squeue_line() { + let line = "12345|test_job|RUNNING|short|2024-01-15T10:00:00|2024-01-15T10:05:00|N/A|node01|4|4G|1:00:00|rule_align_wildcards_sample=S1"; + let job = parse_squeue_line(line).unwrap(); + assert_eq!(job.job_id, "12345"); + assert_eq!(job.name, "test_job"); + assert_eq!(job.state, SlurmJobState::Running); + assert_eq!(job.cpus, Some(4)); + assert_eq!( + job.comment, + Some("rule_align_wildcards_sample=S1".to_string()) + ); + } } diff --git a/crates/charmer-state/Cargo.toml b/crates/charmer-state/Cargo.toml index cc344f0..19e0803 100644 --- a/crates/charmer-state/Cargo.toml +++ b/crates/charmer-state/Cargo.toml @@ -7,8 +7,10 @@ license.workspace = true [dependencies] charmer-core.workspace = true charmer-slurm.workspace = true +charmer-lsf.workspace = true serde.workspace = true serde_json.workspace = true chrono.workspace = true camino.workspace = true thiserror.workspace = true +regex.workspace = true diff --git a/crates/charmer-state/src/lib.rs b/crates/charmer-state/src/lib.rs index bc687cc..d347519 100644 --- a/crates/charmer-state/src/lib.rs +++ b/crates/charmer-state/src/lib.rs @@ -5,7 +5,12 @@ pub mod merge; pub mod types; -pub use 
merge::{correlate_jobs, merge_slurm_jobs, merge_snakemake_jobs, parse_slurm_comment}; +pub use merge::{ + correlate_jobs, merge_lsf_jobs, merge_slurm_jobs, merge_snakemake_jobs, parse_lsf_description, + parse_slurm_comment, +}; pub use types::{ - DataSources, Job, JobCounts, JobError, JobResources, JobStatus, JobTiming, PipelineState, + DataSources, EnvType, ExecutionEnvironment, FailureAnalysis, FailureMode, Job, JobCounts, + JobError, JobResources, JobStatus, JobTiming, PipelineError, PipelineErrorType, PipelineState, + ResourceUsage, MAIN_PIPELINE_JOB_ID, }; diff --git a/crates/charmer-state/src/merge.rs b/crates/charmer-state/src/merge.rs deleted file mode 100644 index 2d76807..0000000 --- a/crates/charmer-state/src/merge.rs +++ /dev/null @@ -1,305 +0,0 @@ -//! Merge SLURM and snakemake data into unified state. - -use crate::types::{DataSources, Job, JobError, JobResources, JobStatus, JobTiming, PipelineState}; -use charmer_core::SnakemakeJob; -use charmer_slurm::{SlurmJob, SlurmJobState}; -use chrono::{DateTime, TimeZone, Utc}; - -/// Parse snakemake SLURM comment field: "rule_{rulename}_wildcards_{wildcards}" -pub fn parse_slurm_comment(comment: &str) -> Option<(String, Option)> { - // Format: "rule_RULENAME_wildcards_WILDCARDS" or just "rule_RULENAME" - if !comment.starts_with("rule_") { - return None; - } - - let rest = &comment[5..]; // Skip "rule_" - - if let Some(wc_pos) = rest.find("_wildcards_") { - let rule = &rest[..wc_pos]; - let wildcards = &rest[wc_pos + 11..]; // Skip "_wildcards_" - Some(( - rule.to_string(), - if wildcards.is_empty() { - None - } else { - Some(wildcards.to_string()) - }, - )) - } else { - Some((rest.to_string(), None)) - } -} - -/// Generate a job ID from rule and wildcards. -fn make_job_id(rule: &str, wildcards: Option<&str>) -> String { - match wildcards { - Some(wc) => format!("{}[{}]", rule, wc), - None => rule.to_string(), - } -} - -/// Convert Unix timestamp to DateTime. 
-fn timestamp_to_datetime(ts: f64) -> DateTime { - Utc.timestamp_opt(ts as i64, ((ts.fract()) * 1_000_000_000.0) as u32) - .single() - .unwrap_or_else(Utc::now) -} - -/// Merge snakemake metadata into pipeline state. -pub fn merge_snakemake_jobs(state: &mut PipelineState, jobs: Vec) { - for snakemake_job in jobs { - let meta = &snakemake_job.metadata; - - // Generate job ID from output path or rule - let job_id = snakemake_job.output_path.clone(); - - // Determine status from metadata - let status = if meta.incomplete { - JobStatus::Running - } else if meta.endtime.is_some() { - JobStatus::Completed - } else { - JobStatus::Pending - }; - - // Build timing - let timing = JobTiming { - queued_at: None, - started_at: Some(timestamp_to_datetime(meta.starttime)), - completed_at: meta.endtime.map(timestamp_to_datetime), - }; - - // Check if job already exists (from SLURM data) - if let Some(existing) = state.jobs.get_mut(&job_id) { - // Update with snakemake-specific data - existing.shellcmd = meta.shellcmd.clone(); - existing.inputs = meta.input.clone(); - existing.log_files = meta.log.clone(); - if existing.timing.started_at.is_none() { - existing.timing.started_at = timing.started_at; - } - if existing.timing.completed_at.is_none() { - existing.timing.completed_at = timing.completed_at; - } - existing.data_sources.has_snakemake_metadata = true; - } else { - // Create new job entry - let job = Job { - id: job_id.clone(), - rule: meta.rule.clone(), - wildcards: None, // Will be parsed from output path pattern - outputs: vec![snakemake_job.output_path.clone()], - inputs: meta.input.clone(), - status, - slurm_job_id: None, - shellcmd: meta.shellcmd.clone(), - timing, - resources: JobResources::default(), - log_files: meta.log.clone(), - error: None, - data_sources: DataSources { - has_snakemake_metadata: true, - has_slurm_squeue: false, - has_slurm_sacct: false, - }, - }; - state.jobs.insert(job_id.clone(), job); - - // Update jobs_by_rule index - state - .jobs_by_rule - 
.entry(meta.rule.clone()) - .or_default() - .push(job_id); - } - } - - state.last_updated = Utc::now(); -} - -/// Merge SLURM jobs into pipeline state. -pub fn merge_slurm_jobs(state: &mut PipelineState, jobs: Vec, from_sacct: bool) { - for slurm_job in jobs { - // Try to parse rule info from comment - let (rule, wildcards) = slurm_job - .comment - .as_ref() - .and_then(|c| parse_slurm_comment(c)) - .unwrap_or_else(|| (slurm_job.name.clone(), None)); - - let job_id = make_job_id(&rule, wildcards.as_deref()); - - // Update run_uuid if this is the first job - if state.run_uuid.is_none() { - state.run_uuid = Some(slurm_job.name.clone()); - } - - // Convert SLURM state to JobStatus - let status = match &slurm_job.state { - SlurmJobState::Pending => JobStatus::Queued, - SlurmJobState::Running => JobStatus::Running, - SlurmJobState::Completed { .. } => JobStatus::Completed, - SlurmJobState::Failed { .. } => JobStatus::Failed, - SlurmJobState::Cancelled => JobStatus::Cancelled, - SlurmJobState::Timeout => JobStatus::Failed, - SlurmJobState::OutOfMemory => JobStatus::Failed, - SlurmJobState::Unknown(_) => JobStatus::Unknown, - }; - - // Build error info - let error = match &slurm_job.state { - SlurmJobState::Failed { exit_code, error } => Some(JobError { - exit_code: *exit_code, - message: error.clone(), - }), - SlurmJobState::Timeout => Some(JobError { - exit_code: -1, - message: "Job exceeded time limit".to_string(), - }), - SlurmJobState::OutOfMemory => Some(JobError { - exit_code: -1, - message: "Job exceeded memory limit".to_string(), - }), - _ => None, - }; - - // Build timing - let timing = JobTiming { - queued_at: slurm_job.submit_time, - started_at: slurm_job.start_time, - completed_at: slurm_job.end_time, - }; - - // Build resources - let resources = JobResources { - cpus: slurm_job.cpus, - memory_mb: slurm_job.mem_mb, - time_limit: slurm_job.time_limit, - partition: slurm_job.partition.clone(), - node: slurm_job.nodelist.clone(), - }; - - // Check if job already 
exists - if let Some(existing) = state.jobs.get_mut(&job_id) { - // Update with SLURM data - existing.slurm_job_id = Some(slurm_job.job_id.clone()); - existing.status = status; - existing.resources = resources; - existing.error = error; - if existing.timing.queued_at.is_none() { - existing.timing.queued_at = timing.queued_at; - } - if from_sacct { - existing.data_sources.has_slurm_sacct = true; - } else { - existing.data_sources.has_slurm_squeue = true; - } - } else { - // Create new job entry - let job = Job { - id: job_id.clone(), - rule, - wildcards, - outputs: vec![], - inputs: vec![], - status, - slurm_job_id: Some(slurm_job.job_id.clone()), - shellcmd: String::new(), - timing, - resources, - log_files: vec![], - error, - data_sources: DataSources { - has_snakemake_metadata: false, - has_slurm_squeue: !from_sacct, - has_slurm_sacct: from_sacct, - }, - }; - - let rule_name = job.rule.clone(); - state.jobs.insert(job_id.clone(), job); - - // Update jobs_by_rule index - state - .jobs_by_rule - .entry(rule_name) - .or_default() - .push(job_id); - } - } - - state.last_updated = Utc::now(); -} - -/// Attempt to correlate jobs that couldn't be matched by comment field. -/// Uses timing windows and rule name matching. 
-pub fn correlate_jobs(state: &mut PipelineState) { - // Find jobs that have SLURM data but no snakemake metadata - let slurm_only: Vec<_> = state - .jobs - .values() - .filter(|j| j.data_sources.has_slurm_squeue && !j.data_sources.has_snakemake_metadata) - .map(|j| j.id.clone()) - .collect(); - - // Find jobs that have snakemake metadata but no SLURM data - let snakemake_only: Vec<_> = state - .jobs - .values() - .filter(|j| j.data_sources.has_snakemake_metadata && !j.data_sources.has_slurm_squeue) - .map(|j| j.id.clone()) - .collect(); - - // Try to match by rule name and timing - for slurm_id in slurm_only { - if let Some(slurm_job) = state.jobs.get(&slurm_id) { - let slurm_start = slurm_job.timing.started_at; - - for snakemake_id in &snakemake_only { - if let Some(sm_job) = state.jobs.get(snakemake_id) { - // Match by rule name - if slurm_job.rule != sm_job.rule { - continue; - } - - // Match by timing (within 60 second window) - if let (Some(slurm_t), Some(sm_t)) = (slurm_start, sm_job.timing.started_at) { - let diff = (slurm_t - sm_t).num_seconds().abs(); - if diff <= 60 { - // Found a match - merge the data - // In a real implementation, we'd merge these entries - // For now, just log the correlation - } - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_slurm_comment() { - // Basic rule only - let (rule, wc) = parse_slurm_comment("rule_align_reads").unwrap(); - assert_eq!(rule, "align_reads"); - assert!(wc.is_none()); - - // Rule with wildcards - let (rule, wc) = parse_slurm_comment("rule_align_reads_wildcards_sample=S1").unwrap(); - assert_eq!(rule, "align_reads"); - assert_eq!(wc.unwrap(), "sample=S1"); - - // Invalid format - assert!(parse_slurm_comment("not_a_rule").is_none()); - } - - #[test] - fn test_make_job_id() { - assert_eq!(make_job_id("align", None), "align"); - assert_eq!(make_job_id("align", Some("sample=S1")), "align[sample=S1]"); - } -} diff --git a/crates/charmer-state/src/merge/comment.rs 
b/crates/charmer-state/src/merge/comment.rs new file mode 100644 index 0000000..f8aef02 --- /dev/null +++ b/crates/charmer-state/src/merge/comment.rs @@ -0,0 +1,67 @@ +//! Comment/description field parsing for snakemake job correlation. + +/// Parse snakemake SLURM comment field: "rule_{rulename}_wildcards_{wildcards}" +pub fn parse_slurm_comment(comment: &str) -> Option<(String, Option)> { + // Format: "rule_RULENAME_wildcards_WILDCARDS" or just "rule_RULENAME" + if !comment.starts_with("rule_") { + return None; + } + + let rest = &comment[5..]; // Skip "rule_" + + if let Some(wc_pos) = rest.find("_wildcards_") { + let rule = &rest[..wc_pos]; + let wildcards = &rest[wc_pos + 11..]; // Skip "_wildcards_" + Some(( + rule.to_string(), + if wildcards.is_empty() { + None + } else { + Some(wildcards.to_string()) + }, + )) + } else { + Some((rest.to_string(), None)) + } +} + +/// Parse snakemake LSF job description field (same format as SLURM comment). +pub fn parse_lsf_description(desc: &str) -> Option<(String, Option)> { + // LSF snakemake executor uses same format as SLURM + parse_slurm_comment(desc) +} + +/// Generate a job ID from rule and wildcards. 
+pub fn make_job_id(rule: &str, wildcards: Option<&str>) -> String { + match wildcards { + Some(wc) => format!("{}[{}]", rule, wc), + None => rule.to_string(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_slurm_comment() { + // Basic rule only + let (rule, wc) = parse_slurm_comment("rule_align_reads").unwrap(); + assert_eq!(rule, "align_reads"); + assert!(wc.is_none()); + + // Rule with wildcards + let (rule, wc) = parse_slurm_comment("rule_align_reads_wildcards_sample=S1").unwrap(); + assert_eq!(rule, "align_reads"); + assert_eq!(wc.unwrap(), "sample=S1"); + + // Invalid format + assert!(parse_slurm_comment("not_a_rule").is_none()); + } + + #[test] + fn test_make_job_id() { + assert_eq!(make_job_id("align", None), "align"); + assert_eq!(make_job_id("align", Some("sample=S1")), "align[sample=S1]"); + } +} diff --git a/crates/charmer-state/src/merge/correlation.rs b/crates/charmer-state/src/merge/correlation.rs new file mode 100644 index 0000000..72796be --- /dev/null +++ b/crates/charmer-state/src/merge/correlation.rs @@ -0,0 +1,49 @@ +//! Job correlation between different data sources. + +use crate::types::PipelineState; + +/// Attempt to correlate jobs that couldn't be matched by comment field. +/// Uses timing windows and rule name matching. 
+pub fn correlate_jobs(state: &mut PipelineState) { + // Find jobs that have SLURM data but no snakemake metadata + let slurm_only: Vec<_> = state + .jobs + .values() + .filter(|j| j.data_sources.has_slurm_squeue && !j.data_sources.has_snakemake_metadata) + .map(|j| j.id.clone()) + .collect(); + + // Find jobs that have snakemake metadata but no SLURM data + let snakemake_only: Vec<_> = state + .jobs + .values() + .filter(|j| j.data_sources.has_snakemake_metadata && !j.data_sources.has_slurm_squeue) + .map(|j| j.id.clone()) + .collect(); + + // Try to match by rule name and timing + for slurm_id in slurm_only { + if let Some(slurm_job) = state.jobs.get(&slurm_id) { + let slurm_start = slurm_job.timing.started_at; + + for snakemake_id in &snakemake_only { + if let Some(sm_job) = state.jobs.get(snakemake_id) { + // Match by rule name + if slurm_job.rule != sm_job.rule { + continue; + } + + // Match by timing (within 60 second window) + if let (Some(slurm_t), Some(sm_t)) = (slurm_start, sm_job.timing.started_at) { + let diff = (slurm_t - sm_t).num_seconds().abs(); + if diff <= 60 { + // Found a match - merge the data + // In a real implementation, we'd merge these entries + // For now, just log the correlation + } + } + } + } + } + } +} diff --git a/crates/charmer-state/src/merge/lsf.rs b/crates/charmer-state/src/merge/lsf.rs new file mode 100644 index 0000000..3310f41 --- /dev/null +++ b/crates/charmer-state/src/merge/lsf.rs @@ -0,0 +1,101 @@ +//! LSF job merging into unified state. + +use super::comment::{make_job_id, parse_lsf_description}; +use crate::types::{DataSources, Job, JobResources, JobTiming, PipelineState, ToJobStatus}; +use charmer_lsf::LsfJob; +use chrono::Utc; + +/// Merge LSF jobs into pipeline state. 
pub fn merge_lsf_jobs(state: &mut PipelineState, jobs: Vec<LsfJob>, from_bhist: bool) {
    for lsf_job in jobs {
        // Try to parse rule info from description
        let (rule, wildcards) = lsf_job
            .description
            .as_ref()
            .and_then(|d| parse_lsf_description(d))
            .unwrap_or_else(|| (lsf_job.name.clone(), None));

        let job_id = make_job_id(&rule, wildcards.as_deref());

        // Update run_uuid if this is the first job
        if state.run_uuid.is_none() {
            state.run_uuid = Some(lsf_job.name.clone());
        }

        // Convert LSF state using the trait
        let status = lsf_job.state.to_job_status();
        let error = lsf_job.state.to_job_error();

        // Build timing
        let timing = JobTiming {
            queued_at: lsf_job.submit_time,
            started_at: lsf_job.start_time,
            completed_at: lsf_job.end_time,
        };

        // Build resources
        let resources = JobResources {
            cpus: lsf_job.nprocs,
            memory_mb: lsf_job.mem_limit_mb,
            time_limit: lsf_job.run_limit,
            partition: lsf_job.queue.clone(), // LSF queue = SLURM partition
            node: lsf_job.exec_host.clone(),
        };

        // Check if job already exists
        if let Some(existing) = state.jobs.get_mut(&job_id) {
            // Update with LSF data
            existing.slurm_job_id = Some(lsf_job.job_id.clone()); // Reuse field for LSF job ID
            existing.status = status;
            existing.resources = resources;
            existing.error = error;
            if existing.timing.queued_at.is_none() {
                existing.timing.queued_at = timing.queued_at;
            }
            if from_bhist {
                existing.data_sources.has_lsf_bhist = true;
            } else {
                existing.data_sources.has_lsf_bjobs = true;
            }
        } else {
            // Create new job entry
            let job = Job {
                id: job_id.clone(),
                rule,
                wildcards,
                outputs: vec![],
                inputs: vec![],
                status,
                slurm_job_id: Some(lsf_job.job_id.clone()), // Reuse for LSF job ID
                shellcmd: String::new(),
                timing,
                resources,
                usage: None,
                log_files: vec![],
                error,
                conda_env: None,
                container_img_url: None,
                data_sources: DataSources {
                    has_snakemake_metadata: false,
                    has_slurm_squeue: false,
has_slurm_sacct: false, + has_lsf_bjobs: !from_bhist, + has_lsf_bhist: from_bhist, + }, + is_target: false, + }; + + let rule_name = job.rule.clone(); + state.jobs.insert(job_id.clone(), job); + + // Update jobs_by_rule index + state + .jobs_by_rule + .entry(rule_name) + .or_default() + .push(job_id); + } + } + + state.last_updated = Utc::now(); +} diff --git a/crates/charmer-state/src/merge/mod.rs b/crates/charmer-state/src/merge/mod.rs new file mode 100644 index 0000000..c1717fe --- /dev/null +++ b/crates/charmer-state/src/merge/mod.rs @@ -0,0 +1,13 @@ +//! Merge SLURM, LSF, and snakemake data into unified state. + +mod comment; +mod correlation; +mod lsf; +mod slurm; +mod snakemake; + +pub use comment::{make_job_id, parse_lsf_description, parse_slurm_comment}; +pub use correlation::correlate_jobs; +pub use lsf::merge_lsf_jobs; +pub use slurm::merge_slurm_jobs; +pub use snakemake::merge_snakemake_jobs; diff --git a/crates/charmer-state/src/merge/slurm.rs b/crates/charmer-state/src/merge/slurm.rs new file mode 100644 index 0000000..ed826fe --- /dev/null +++ b/crates/charmer-state/src/merge/slurm.rs @@ -0,0 +1,101 @@ +//! SLURM job merging into unified state. + +use super::comment::{make_job_id, parse_slurm_comment}; +use crate::types::{DataSources, Job, JobResources, JobTiming, PipelineState, ToJobStatus}; +use charmer_slurm::SlurmJob; +use chrono::Utc; + +/// Merge SLURM jobs into pipeline state. 
pub fn merge_slurm_jobs(state: &mut PipelineState, jobs: Vec<SlurmJob>, from_sacct: bool) {
    for slurm_job in jobs {
        // Try to parse rule info from comment
        let (rule, wildcards) = slurm_job
            .comment
            .as_ref()
            .and_then(|c| parse_slurm_comment(c))
            .unwrap_or_else(|| (slurm_job.name.clone(), None));

        let job_id = make_job_id(&rule, wildcards.as_deref());

        // Update run_uuid if this is the first job
        if state.run_uuid.is_none() {
            state.run_uuid = Some(slurm_job.name.clone());
        }

        // Convert SLURM state using the trait
        let status = slurm_job.state.to_job_status();
        let error = slurm_job.state.to_job_error();

        // Build timing
        let timing = JobTiming {
            queued_at: slurm_job.submit_time,
            started_at: slurm_job.start_time,
            completed_at: slurm_job.end_time,
        };

        // Build resources
        let resources = JobResources {
            cpus: slurm_job.cpus,
            memory_mb: slurm_job.mem_mb,
            time_limit: slurm_job.time_limit,
            partition: slurm_job.partition.clone(),
            node: slurm_job.nodelist.clone(),
        };

        // Check if job already exists
        if let Some(existing) = state.jobs.get_mut(&job_id) {
            // Update with SLURM data
            existing.slurm_job_id = Some(slurm_job.job_id.clone());
            existing.status = status;
            existing.resources = resources;
            existing.error = error;
            if existing.timing.queued_at.is_none() {
                existing.timing.queued_at = timing.queued_at;
            }
            if from_sacct {
                existing.data_sources.has_slurm_sacct = true;
            } else {
                existing.data_sources.has_slurm_squeue = true;
            }
        } else {
            // Create new job entry
            let job = Job {
                id: job_id.clone(),
                rule,
                wildcards,
                outputs: vec![],
                inputs: vec![],
                status,
                slurm_job_id: Some(slurm_job.job_id.clone()),
                shellcmd: String::new(),
                timing,
                resources,
                usage: None,
                log_files: vec![],
                error,
                conda_env: None,
                container_img_url: None,
                data_sources: DataSources {
                    has_snakemake_metadata: false,
                    has_slurm_squeue: !from_sacct,
                    has_slurm_sacct: from_sacct,
                    has_lsf_bjobs: false,
has_lsf_bhist: false, + }, + is_target: false, + }; + + let rule_name = job.rule.clone(); + state.jobs.insert(job_id.clone(), job); + + // Update jobs_by_rule index + state + .jobs_by_rule + .entry(rule_name) + .or_default() + .push(job_id); + } + } + + state.last_updated = Utc::now(); +} diff --git a/crates/charmer-state/src/merge/snakemake.rs b/crates/charmer-state/src/merge/snakemake.rs new file mode 100644 index 0000000..ffddebe --- /dev/null +++ b/crates/charmer-state/src/merge/snakemake.rs @@ -0,0 +1,142 @@ +//! Snakemake metadata merging into unified state. + +use crate::types::{DataSources, Job, JobResources, JobStatus, JobTiming, PipelineState}; +use charmer_core::SnakemakeJob; +use chrono::{DateTime, TimeZone, Utc}; + +/// Convert Unix timestamp to DateTime. +fn timestamp_to_datetime(ts: f64) -> DateTime { + Utc.timestamp_opt(ts as i64, ((ts.fract()) * 1_000_000_000.0) as u32) + .single() + .unwrap_or_else(Utc::now) +} + +/// Extract wildcards from output path. +/// For paths like "results/aligned/sample1.bam" with rule "align_sample", +/// tries to extract sample=sample1 based on common patterns. +fn extract_wildcards(output_path: &str, _rule: &str) -> Option { + let parts: Vec<&str> = output_path.split('/').collect(); + if parts.len() < 2 { + return None; + } + + let mut wildcards = Vec::new(); + + // Get the filename without extension + if let Some(filename) = parts.last() { + let name_parts: Vec<&str> = filename.split('.').collect(); + if let Some(base_name) = name_parts.first() { + // Check for patterns like "sample1_chr1" -> sample=sample1, chrom=chr1 + if base_name.contains('_') { + let segments: Vec<&str> = base_name.split('_').collect(); + if segments.len() == 2 { + // Heuristic: first part is sample, second is something else (chrom, etc.) 
+ if segments[1].starts_with("chr") { + wildcards.push(format!("sample={}", segments[0])); + wildcards.push(format!("chrom={}", segments[1])); + } else { + wildcards.push(format!("sample={}", segments[0])); + wildcards.push(format!("var={}", segments[1])); + } + } else if segments.len() == 1 { + wildcards.push(format!("sample={}", segments[0])); + } + } else { + // Simple case: just a sample name + wildcards.push(format!("sample={}", base_name)); + } + } + } + + if wildcards.is_empty() { + None + } else { + Some(wildcards.join(", ")) + } +} + +/// Merge snakemake metadata into pipeline state. +pub fn merge_snakemake_jobs(state: &mut PipelineState, jobs: Vec) { + for snakemake_job in jobs { + let meta = &snakemake_job.metadata; + + // Generate job ID from output path or rule + let job_id = snakemake_job.output_path.clone(); + + // Determine status from metadata + let status = if meta.incomplete { + JobStatus::Running + } else if meta.endtime.is_some() { + JobStatus::Completed + } else { + JobStatus::Pending + }; + + // Build timing + let timing = JobTiming { + queued_at: None, + started_at: Some(timestamp_to_datetime(meta.starttime)), + completed_at: meta.endtime.map(timestamp_to_datetime), + }; + + // Extract wildcards from output path + let wildcards = extract_wildcards(&snakemake_job.output_path, &meta.rule); + + // Check if job already exists (from SLURM data) + if let Some(existing) = state.jobs.get_mut(&job_id) { + // Update with snakemake-specific data + existing.shellcmd = meta.shellcmd.clone(); + existing.inputs = meta.input.clone(); + existing.log_files = meta.log.clone(); + existing.conda_env = meta.conda_env.clone(); + existing.container_img_url = meta.container_img_url.clone(); + if existing.wildcards.is_none() { + existing.wildcards = wildcards; + } + if existing.timing.started_at.is_none() { + existing.timing.started_at = timing.started_at; + } + if existing.timing.completed_at.is_none() { + existing.timing.completed_at = timing.completed_at; + } + 
existing.data_sources.has_snakemake_metadata = true; + } else { + // Create new job entry + let job = Job { + id: job_id.clone(), + rule: meta.rule.clone(), + wildcards, + outputs: vec![snakemake_job.output_path.clone()], + inputs: meta.input.clone(), + status, + slurm_job_id: None, + shellcmd: meta.shellcmd.clone(), + timing, + resources: JobResources::default(), + usage: None, + log_files: meta.log.clone(), + error: None, + conda_env: meta.conda_env.clone(), + container_img_url: meta.container_img_url.clone(), + data_sources: DataSources { + has_snakemake_metadata: true, + has_slurm_squeue: false, + has_slurm_sacct: false, + has_lsf_bjobs: false, + has_lsf_bhist: false, + }, + is_target: false, + }; + state.jobs.insert(job_id.clone(), job); + + // Update jobs_by_rule index + state + .jobs_by_rule + .entry(meta.rule.clone()) + .or_default() + .push(job_id); + } + } + + state.last_updated = Utc::now(); +} diff --git a/crates/charmer-state/src/types.rs b/crates/charmer-state/src/types.rs index cf44534..998fdc2 100644 --- a/crates/charmer-state/src/types.rs +++ b/crates/charmer-state/src/types.rs @@ -6,6 +6,9 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; +/// Special job ID for the main snakemake pipeline log. +pub const MAIN_PIPELINE_JOB_ID: &str = "__snakemake_main__"; + /// Unified job status. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum JobStatus { @@ -25,6 +28,79 @@ pub enum JobStatus { Unknown, } +/// Trait for converting scheduler-specific states to unified JobStatus. +pub trait ToJobStatus { + /// Convert to unified JobStatus. + fn to_job_status(&self) -> JobStatus; + + /// Extract error information if the job failed. 
+ fn to_job_error(&self) -> Option; +} + +// Implementation for SLURM job states +impl ToJobStatus for charmer_slurm::SlurmJobState { + fn to_job_status(&self) -> JobStatus { + match self { + Self::Pending => JobStatus::Queued, + Self::Running => JobStatus::Running, + Self::Completed { .. } => JobStatus::Completed, + Self::Failed { .. } => JobStatus::Failed, + Self::Cancelled => JobStatus::Cancelled, + Self::Timeout => JobStatus::Failed, + Self::OutOfMemory => JobStatus::Failed, + Self::Unknown(_) => JobStatus::Unknown, + } + } + + fn to_job_error(&self) -> Option { + match self { + Self::Failed { exit_code, error } => Some(JobError { + exit_code: *exit_code, + message: error.clone(), + analysis: None, // Will be populated by failure analysis + }), + Self::Timeout => Some(JobError { + exit_code: -1, + message: "Job exceeded time limit".to_string(), + analysis: None, + }), + Self::OutOfMemory => Some(JobError { + exit_code: -1, + message: "Job exceeded memory limit".to_string(), + analysis: None, + }), + _ => None, + } + } +} + +// Implementation for LSF job states +impl ToJobStatus for charmer_lsf::LsfJobState { + fn to_job_status(&self) -> JobStatus { + match self { + Self::Pending => JobStatus::Queued, + Self::Running => JobStatus::Running, + Self::Done { .. } => JobStatus::Completed, + Self::Exit { .. } => JobStatus::Failed, + Self::UserSuspendedPending | Self::UserSuspended => JobStatus::Pending, + Self::SystemSuspended => JobStatus::Pending, + Self::Zombie => JobStatus::Unknown, + Self::Unknown(_) => JobStatus::Unknown, + } + } + + fn to_job_error(&self) -> Option { + match self { + Self::Exit { exit_code, error } => Some(JobError { + exit_code: *exit_code, + message: error.clone(), + analysis: None, // Will be populated by failure analysis + }), + _ => None, + } + } +} + impl JobStatus { pub fn symbol(&self) -> &'static str { match self { @@ -47,7 +123,7 @@ pub struct JobTiming { pub completed_at: Option>, } -/// Job resource allocation. 
+/// Job resource allocation (requested). #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct JobResources { pub cpus: Option, @@ -57,11 +133,333 @@ pub struct JobResources { pub node: Option, } +/// Actual resource usage (from sacct/bhist for finished jobs). +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ResourceUsage { + /// Maximum resident set size (actual memory used) in MB + pub max_rss_mb: Option, + /// Actual elapsed runtime in seconds + pub elapsed_seconds: Option, + /// Total CPU time in seconds + pub cpu_time_seconds: Option, +} + /// Job error information. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct JobError { pub exit_code: i32, pub message: String, + /// Detailed failure analysis (if available) + pub analysis: Option, +} + +/// Detailed failure analysis from SLURM/LSF. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FailureAnalysis { + /// Classified failure mode + pub mode: FailureMode, + /// Human-readable explanation + pub explanation: String, + /// Suggested fix + pub suggestion: String, + /// Memory used (MB) if available + pub memory_used_mb: Option, + /// Memory limit (MB) if available + pub memory_limit_mb: Option, + /// Runtime (seconds) if available + pub runtime_seconds: Option, + /// Time limit (seconds) if available + pub time_limit_seconds: Option, +} + +/// Failure mode classification. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum FailureMode { + /// Job ran out of memory + OutOfMemory, + /// Job exceeded time limit + Timeout, + /// Job failed with exit code + ExitCode, + /// Job was cancelled/killed + Cancelled, + /// Node/host failure + NodeFailure, + /// Unknown failure + Unknown, +} + +/// Execution environment type. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum EnvType { + /// Pixi environment (from `pixi run -e `) + Pixi, + /// Conda environment (from `conda run -n ` or conda_env metadata) + Conda, + /// Container (Singularity/Apptainer/Docker) + Container, + /// Direct shell execution (no environment wrapper) + Direct, +} + +/// Execution environment information. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExecutionEnvironment { + /// Type of environment + pub env_type: EnvType, + /// Environment name (e.g., "myenv" for pixi/conda) + pub env_name: Option, + /// Container image URL (for containers) + pub image_url: Option, +} + +impl ExecutionEnvironment { + /// Detect execution environment from job metadata. + pub fn detect(shellcmd: &str, conda_env: Option<&str>, container_url: Option<&str>) -> Self { + // Priority: Container > Pixi > Conda > Direct + + // Check for container + if let Some(url) = container_url { + return Self { + env_type: EnvType::Container, + env_name: None, + image_url: Some(url.to_string()), + }; + } + + // Check shellcmd for container patterns + if let Some(image) = Self::detect_container(shellcmd) { + return Self { + env_type: EnvType::Container, + env_name: None, + image_url: Some(image), + }; + } + + // Check shellcmd for pixi pattern: `pixi run -e ` + if let Some(env_name) = Self::detect_pixi(shellcmd) { + return Self { + env_type: EnvType::Pixi, + env_name: Some(env_name), + image_url: None, + }; + } + + // Check for conda environment (from metadata or shellcmd) + if let Some(env) = conda_env { + return Self { + env_type: EnvType::Conda, + env_name: Some(env.to_string()), + image_url: None, + }; + } + + // Check shellcmd for conda pattern: `conda run -n ` + if let Some(env_name) = Self::detect_conda(shellcmd) { + return Self { + env_type: EnvType::Conda, + env_name: Some(env_name), + image_url: None, + }; + } + + // Default: direct execution + Self { + env_type: EnvType::Direct, + env_name: None, + 
image_url: None, + } + } + + /// Detect pixi environment from shell command. + fn detect_pixi(shellcmd: &str) -> Option { + // Pattern: `pixi run -e ` or `pixi run --environment ` + let patterns = [ + (r"pixi\s+run\s+-e\s+(\S+)", 1), + (r"pixi\s+run\s+--environment\s+(\S+)", 1), + ]; + + for (pattern, group) in patterns { + if let Ok(re) = regex::Regex::new(pattern) { + if let Some(caps) = re.captures(shellcmd) { + if let Some(m) = caps.get(group) { + return Some(m.as_str().to_string()); + } + } + } + } + None + } + + /// Detect conda environment from shell command. + fn detect_conda(shellcmd: &str) -> Option { + // Pattern: `conda run -n ` or `conda run --name ` + // Also: `mamba run -n ` or `micromamba run -n ` + let patterns = [ + (r"(?:conda|mamba|micromamba)\s+run\s+-n\s+(\S+)", 1), + (r"(?:conda|mamba|micromamba)\s+run\s+--name\s+(\S+)", 1), + (r"conda\s+activate\s+(\S+)", 1), + ]; + + for (pattern, group) in patterns { + if let Ok(re) = regex::Regex::new(pattern) { + if let Some(caps) = re.captures(shellcmd) { + if let Some(m) = caps.get(group) { + return Some(m.as_str().to_string()); + } + } + } + } + None + } + + /// Detect container from shell command. + fn detect_container(shellcmd: &str) -> Option { + // Pattern: `singularity exec ` or `docker run ` or `apptainer exec ` + let patterns = [ + (r"(?:singularity|apptainer)\s+exec\s+(\S+)", 1), + (r"docker\s+run\s+(?:[^/]+\s+)*(\S+/\S+)", 1), + ]; + + for (pattern, group) in patterns { + if let Ok(re) = regex::Regex::new(pattern) { + if let Some(caps) = re.captures(shellcmd) { + if let Some(m) = caps.get(group) { + return Some(m.as_str().to_string()); + } + } + } + } + None + } + + /// Get a display string for the environment. 
+ pub fn display(&self) -> String { + match &self.env_type { + EnvType::Pixi => { + if let Some(name) = &self.env_name { + format!("pixi:{}", name) + } else { + "pixi".to_string() + } + } + EnvType::Conda => { + if let Some(name) = &self.env_name { + format!("conda:{}", name) + } else { + "conda".to_string() + } + } + EnvType::Container => { + if let Some(url) = &self.image_url { + // Truncate long URLs + if url.len() > 40 { + format!("container:...{}", &url[url.len() - 35..]) + } else { + format!("container:{}", url) + } + } else { + "container".to_string() + } + } + EnvType::Direct => "direct".to_string(), + } + } +} + +/// Pipeline error type classification. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum PipelineErrorType { + /// Missing input file(s) + MissingInput, + /// Shell command failed with exit code + CommandFailed, + /// Rule exception + RuleError, + /// Workflow-level error + WorkflowError, + /// Directory locked by another process + Locked, + /// Incomplete output files + IncompleteFiles, + /// Syntax error in Snakefile + SyntaxError, + /// Generic/unclassified error + Generic, +} + +/// Structured pipeline error. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PipelineError { + /// Error type classification + pub error_type: PipelineErrorType, + /// Rule name (if applicable) + pub rule: Option, + /// Primary error message + pub message: String, + /// Additional details (file paths, exit codes, etc.) + pub details: Vec, + /// Exit code (for command failures) + pub exit_code: Option, +} + +impl PipelineError { + /// Create a new pipeline error. + pub fn new(error_type: PipelineErrorType, message: impl Into) -> Self { + Self { + error_type, + rule: None, + message: message.into(), + details: Vec::new(), + exit_code: None, + } + } + + /// Add a rule name. + pub fn with_rule(mut self, rule: impl Into) -> Self { + self.rule = Some(rule.into()); + self + } + + /// Add detail. 
+ pub fn with_detail(mut self, detail: impl Into) -> Self { + self.details.push(detail.into()); + self + } + + /// Add exit code. + pub fn with_exit_code(mut self, code: i32) -> Self { + self.exit_code = Some(code); + self + } + + /// Get icon for error type. + pub fn icon(&self) -> &'static str { + match self.error_type { + PipelineErrorType::MissingInput => "📁", + PipelineErrorType::CommandFailed => "💥", + PipelineErrorType::RuleError => "📋", + PipelineErrorType::WorkflowError => "⚙️", + PipelineErrorType::Locked => "🔒", + PipelineErrorType::IncompleteFiles => "⚠️", + PipelineErrorType::SyntaxError => "📝", + PipelineErrorType::Generic => "❌", + } + } + + /// Get short label for error type. + pub fn label(&self) -> &'static str { + match self.error_type { + PipelineErrorType::MissingInput => "Missing Input", + PipelineErrorType::CommandFailed => "Command Failed", + PipelineErrorType::RuleError => "Rule Error", + PipelineErrorType::WorkflowError => "Workflow Error", + PipelineErrorType::Locked => "Locked", + PipelineErrorType::IncompleteFiles => "Incomplete", + PipelineErrorType::SyntaxError => "Syntax Error", + PipelineErrorType::Generic => "Error", + } + } } /// Data source flags. @@ -70,6 +468,8 @@ pub struct DataSources { pub has_snakemake_metadata: bool, pub has_slurm_squeue: bool, pub has_slurm_sacct: bool, + pub has_lsf_bjobs: bool, + pub has_lsf_bhist: bool, } /// Unified job combining SLURM and snakemake data. 
@@ -102,17 +502,29 @@ pub struct Job { /// Timing information pub timing: JobTiming, - /// Resource allocation + /// Resource allocation (requested) pub resources: JobResources, + /// Actual resource usage (for finished jobs) + pub usage: Option, + /// Log file paths pub log_files: Vec, /// Error details (if failed) pub error: Option, + /// Conda environment (from snakemake metadata) + pub conda_env: Option, + + /// Container image URL (from snakemake metadata) + pub container_img_url: Option, + /// Data sources pub data_sources: DataSources, + + /// Whether this is a target rule (no outputs, like "all") + pub is_target: bool, } /// Pipeline-level state. @@ -132,6 +544,21 @@ pub struct PipelineState { /// Last update timestamp pub last_updated: DateTime, + + /// Total jobs from snakemake log (if known) + pub total_jobs: Option, + + /// Number of cores being used + pub cores: Option, + + /// Host machine name + pub host: Option, + + /// Whether the pipeline has finished + pub pipeline_finished: bool, + + /// Pipeline-level errors from main log (structured) + pub pipeline_errors: Vec, } impl PipelineState { @@ -142,6 +569,105 @@ impl PipelineState { jobs: HashMap::new(), jobs_by_rule: HashMap::new(), last_updated: Utc::now(), + total_jobs: None, + cores: None, + host: None, + pipeline_finished: false, + pipeline_errors: Vec::new(), + } + } + + /// Update pipeline state from snakemake log info. 
+ pub fn update_from_log_info(&mut self, info: &charmer_core::SnakemakeLogInfo) { + if info.total_jobs.is_some() { + self.total_jobs = info.total_jobs; + } + if info.cores.is_some() { + self.cores = info.cores; + } + if info.host.is_some() { + self.host = info.host.clone(); + } + self.pipeline_finished = info.finished; + if !info.errors.is_empty() { + self.pipeline_errors = info.errors.iter().map(|s| parse_error_string(s)).collect(); + } + + // Create synthetic jobs for target rules (rules without output files) + // These rules appear in jobs_by_rule from the log but won't have metadata files + // since they have no output files (e.g., "all" rule that just aggregates targets). + // We detect these by parsing rule blocks in the log and checking for output: lines. + for rule in &info.target_rules { + if !self.jobs_by_rule.contains_key(rule) { + let job_id = format!("__target_{}__", rule); + let status = if info.finished && self.pipeline_errors.is_empty() { + JobStatus::Completed + } else if info.finished { + JobStatus::Failed + } else { + JobStatus::Pending + }; + + let job = Job { + id: job_id.clone(), + rule: rule.clone(), + wildcards: None, + outputs: Vec::new(), + inputs: Vec::new(), + status, + slurm_job_id: None, + shellcmd: String::new(), + timing: JobTiming::default(), + resources: JobResources::default(), + usage: None, + log_files: Vec::new(), + error: None, + conda_env: None, + container_img_url: None, + data_sources: DataSources::default(), + is_target: true, + }; + self.jobs.insert(job_id.clone(), job); + self.jobs_by_rule.insert(rule.clone(), vec![job_id]); + } + } + + // Update status of existing target jobs based on pipeline state + for rule in &info.target_rules { + let job_id = format!("__target_{}__", rule); + if let Some(job) = self.jobs.get_mut(&job_id) { + if job.is_target { + job.status = if info.finished && self.pipeline_errors.is_empty() { + JobStatus::Completed + } else if info.finished { + JobStatus::Failed + } else { + JobStatus::Pending + 
}; + // Update timing when pipeline finishes + if info.finished && job.timing.completed_at.is_none() { + job.timing.completed_at = Some(chrono::Utc::now()); + } + } + } + + // Also update any regular jobs for this target rule (they have no outputs) + if let Some(job_ids) = self.jobs_by_rule.get(rule).cloned() { + for job_id in job_ids { + if let Some(job) = self.jobs.get_mut(&job_id) { + if job.outputs.is_empty() { + job.is_target = true; + job.status = if info.finished && self.pipeline_errors.is_empty() { + JobStatus::Completed + } else if info.finished { + JobStatus::Failed + } else { + JobStatus::Pending + }; + } + } + } + } } } @@ -161,6 +687,84 @@ impl PipelineState { counts.total = self.jobs.len(); counts } + + /// Estimate time remaining for the pipeline to complete. + /// Returns (estimated_seconds, is_reliable) where is_reliable indicates + /// if we have enough completed jobs to make a good estimate. + pub fn estimate_eta(&self) -> Option<(u64, bool)> { + let counts = self.job_counts(); + let total = self.total_jobs.unwrap_or(counts.total); + + // Need at least some completed jobs to estimate + if counts.completed == 0 { + return None; + } + + // Calculate average runtime from completed jobs + let mut total_runtime_secs: u64 = 0; + let mut completed_with_timing = 0; + + for job in self.jobs.values() { + if job.status == JobStatus::Completed { + if let (Some(start), Some(end)) = (job.timing.started_at, job.timing.completed_at) { + let runtime = (end - start).num_seconds().max(0) as u64; + total_runtime_secs += runtime; + completed_with_timing += 1; + } + } + } + + if completed_with_timing == 0 { + return None; + } + + let avg_runtime = total_runtime_secs / completed_with_timing as u64; + + // Calculate remaining work + let remaining = total.saturating_sub(counts.completed); + let running = counts.running; + + // Estimate for running jobs: average half their expected time remaining + let running_contribution = if running > 0 { + // Assume running jobs are 
on average halfway done + (running as u64 * avg_runtime) / 2 + } else { + 0 + }; + + // Estimate for pending jobs + let pending_contribution = remaining.saturating_sub(running) as u64 * avg_runtime; + + // Total estimate (note: this assumes serial execution, actual time depends on parallelism) + let estimate = running_contribution + pending_contribution; + + // Reliability: we have enough data if at least 20% of jobs are completed + let is_reliable = counts.completed > 2 && (counts.completed * 5) >= total; + + Some((estimate, is_reliable)) + } + + /// Get ETA as a formatted string. + pub fn eta_string(&self) -> Option { + self.estimate_eta().map(|(secs, reliable)| { + let time_str = if secs >= 3600 { + let hours = secs / 3600; + let mins = (secs % 3600) / 60; + format!("{}h{}m", hours, mins) + } else if secs >= 60 { + let mins = secs / 60; + format!("{}m", mins) + } else { + format!("{}s", secs) + }; + + if reliable { + format!("~{}", time_str) + } else { + format!("~{}?", time_str) // Add ? to indicate uncertainty + } + }) + } } #[derive(Debug, Clone, Default)] @@ -174,3 +778,134 @@ pub struct JobCounts { pub cancelled: usize, pub unknown: usize, } + +/// Parse a raw error string into a structured PipelineError. 
+fn parse_error_string(error: &str) -> PipelineError { + let error_lower = error.to_lowercase(); + + // MissingInputException + if error_lower.contains("missinginputexception") || error_lower.contains("missing input") { + let mut pe = PipelineError::new(PipelineErrorType::MissingInput, error.to_string()); + // Try to extract rule name + if let Some(rule) = extract_rule_from_error(error) { + pe = pe.with_rule(rule); + } + // Try to extract file paths + for line in error.lines() { + let trimmed = line.trim(); + if trimmed.starts_with('/') || trimmed.contains("results/") || trimmed.contains("data/") + { + pe = pe.with_detail(trimmed.to_string()); + } + } + return pe; + } + + // CalledProcessError / command failed + if error_lower.contains("calledprocesserror") + || error_lower.contains("error executing rule") + || error_lower.contains("error in rule") + { + let mut pe = PipelineError::new(PipelineErrorType::CommandFailed, error.to_string()); + if let Some(rule) = extract_rule_from_error(error) { + pe = pe.with_rule(rule); + } + // Try to extract exit code + if let Some(code) = extract_exit_code(error) { + pe = pe.with_exit_code(code); + } + return pe; + } + + // Lock exception + if error_lower.contains("lockexception") || error_lower.contains("directory cannot be locked") { + return PipelineError::new(PipelineErrorType::Locked, error.to_string()); + } + + // Incomplete files + if error_lower.contains("incompletefilesexception") || error_lower.contains("incomplete") { + let mut pe = PipelineError::new(PipelineErrorType::IncompleteFiles, error.to_string()); + for line in error.lines() { + let trimmed = line.trim(); + if trimmed.starts_with('/') || trimmed.contains("results/") { + pe = pe.with_detail(trimmed.to_string()); + } + } + return pe; + } + + // Syntax error + if error_lower.contains("syntaxerror") || error_lower.contains("syntax error") { + return PipelineError::new(PipelineErrorType::SyntaxError, error.to_string()); + } + + // Workflow error + if 
error_lower.contains("workflowerror") || error_lower.contains("workflow error") { + return PipelineError::new(PipelineErrorType::WorkflowError, error.to_string()); + } + + // Rule exception + if error_lower.contains("ruleexception") { + let mut pe = PipelineError::new(PipelineErrorType::RuleError, error.to_string()); + if let Some(rule) = extract_rule_from_error(error) { + pe = pe.with_rule(rule); + } + return pe; + } + + // Generic error + let mut pe = PipelineError::new(PipelineErrorType::Generic, error.to_string()); + if let Some(rule) = extract_rule_from_error(error) { + pe = pe.with_rule(rule); + } + pe +} + +/// Extract rule name from error message. +fn extract_rule_from_error(error: &str) -> Option { + // Pattern: "rule " or "Rule: " or "Error in rule " + let patterns = [ + r"(?i)error in rule\s+(\w+)", + r"(?i)rule[:\s]+(\w+)", + r"(?i)for rule\s+(\w+)", + ]; + + for pattern in patterns { + if let Ok(re) = regex::Regex::new(pattern) { + if let Some(caps) = re.captures(error) { + if let Some(m) = caps.get(1) { + let rule = m.as_str(); + // Skip common false positives + if rule != "the" && rule != "a" && rule != "an" { + return Some(rule.to_string()); + } + } + } + } + } + None +} + +/// Extract exit code from error message. 
+fn extract_exit_code(error: &str) -> Option { + // Pattern: "exit code: N" or "exitcode: N" or "return code N" + let patterns = [ + r"(?i)exit\s*code[:\s]+(\d+)", + r"(?i)exitcode[:\s]+(\d+)", + r"(?i)return\s*code[:\s]+(\d+)", + r"(?i)returned\s+(\d+)", + ]; + + for pattern in patterns { + if let Ok(re) = regex::Regex::new(pattern) { + if let Some(caps) = re.captures(error) { + if let Some(m) = caps.get(1) { + if let Ok(code) = m.as_str().parse() { + return Some(code); + } + } + } + } + } + None +} diff --git a/crates/charmer/Cargo.toml b/crates/charmer/Cargo.toml index 5ec261c..8fd95c0 100644 --- a/crates/charmer/Cargo.toml +++ b/crates/charmer/Cargo.toml @@ -12,6 +12,7 @@ path = "src/main.rs" charmer-cli.workspace = true charmer-core.workspace = true charmer-slurm.workspace = true +charmer-lsf.workspace = true charmer-state.workspace = true charmer-monitor.workspace = true clap = { workspace = true, features = ["derive"] } @@ -19,3 +20,6 @@ tokio.workspace = true miette.workspace = true crossterm.workspace = true ratatui.workspace = true +notify.workspace = true +camino.workspace = true +chrono.workspace = true diff --git a/crates/charmer/src/main.rs b/crates/charmer/src/main.rs index 46dffd7..e2a2ddb 100644 --- a/crates/charmer/src/main.rs +++ b/crates/charmer/src/main.rs @@ -1,25 +1,211 @@ -//! Charmer - Snakemake pipeline monitor for SLURM. +//! Charmer - Snakemake pipeline monitor for SLURM/LSF. 
+ +mod polling; +mod watcher; use charmer_cli::Args; +use charmer_core::{parse_main_log, parse_metadata_file, scan_metadata_dir}; use charmer_monitor::App; -use charmer_state::PipelineState; +use charmer_state::{merge_snakemake_jobs, PipelineState}; use clap::Parser; -use miette::Result; +use crossterm::{ + execute, + terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen}, +}; +use miette::{IntoDiagnostic, Result}; +use polling::{init_polling, PollingConfig}; +use ratatui::prelude::*; +use std::collections::HashMap; +use std::io; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::Mutex; +use watcher::{MetadataWatcher, WatcherEvent}; -fn main() -> Result<()> { +#[tokio::main] +async fn main() -> Result<()> { let args = Args::parse(); - // Initialize pipeline state - let state = PipelineState::new(args.dir.clone()); - let mut app = App::new(state); + // Initialize pipeline state wrapped in Arc> for sharing with polling service + let state = Arc::new(Mutex::new(PipelineState::new(args.dir.clone()))); + + // Scan existing metadata files on startup, filtering to recent jobs + if let Ok(existing_jobs) = scan_metadata_dir(&args.dir) { + let cutoff = chrono::Utc::now() - chrono::Duration::hours(args.history_hours as i64); + let recent_jobs: Vec<_> = existing_jobs + .into_iter() + .filter(|job| { + // Keep jobs that are incomplete (still running) or started recently + job.metadata.incomplete || job.metadata.starttime > cutoff.timestamp() as f64 + }) + .collect(); + + if !recent_jobs.is_empty() { + let mut state_guard = state.lock().await; + merge_snakemake_jobs(&mut state_guard, recent_jobs); + } + } + + // Initialize polling service in the background + let poll_config = PollingConfig { + active_poll_interval: Duration::from_secs(args.poll_interval), + history_poll_interval: Duration::from_secs(30), + run_uuid: args.run_uuid.clone(), + history_hours: args.history_hours, + }; + + let _polling_handle = 
init_polling(Arc::clone(&state), poll_config).await; + + // Initialize app with a clone of the initial state + let initial_state = { + let state_guard = state.lock().await; + state_guard.clone() + }; + let mut app = App::new(initial_state); + app.update_job_list(); + + // Setup terminal + enable_raw_mode().into_diagnostic()?; + let mut stdout = io::stdout(); + // Note: We don't enable mouse capture to allow text selection with trackpad + execute!(stdout, EnterAlternateScreen).into_diagnostic()?; + let backend = CrosstermBackend::new(stdout); + let mut terminal = Terminal::new(backend).into_diagnostic()?; - println!("charmer - monitoring {}", args.dir); - println!("Poll interval: {}s", args.poll_interval); - println!("Theme: {}", args.theme); + // Create file watcher + let watcher = MetadataWatcher::new(&args.dir).ok(); - // TODO: Start TUI event loop - // TODO: Start SLURM polling loop - // TODO: Start metadata file watcher + // Run the main loop + let res = run_app(&mut terminal, &mut app, state, watcher).await; + + // Restore terminal + disable_raw_mode().into_diagnostic()?; + execute!(terminal.backend_mut(), LeaveAlternateScreen).into_diagnostic()?; + terminal.show_cursor().into_diagnostic()?; + + // Handle result + if let Err(err) = res { + eprintln!("Error: {}", err); + } Ok(()) } + +/// Main application loop. 
+async fn run_app( + terminal: &mut Terminal>, + app: &mut App, + shared_state: Arc>, + watcher: Option, +) -> io::Result<()> { + let tick_rate = Duration::from_millis(100); + let update_interval = Duration::from_millis(500); + let rescan_interval = Duration::from_secs(2); // Periodic re-scan fallback + let log_parse_interval = Duration::from_secs(1); // Parse main log frequently for progress + + let mut last_update = std::time::Instant::now(); + let mut last_rescan = std::time::Instant::now(); + let mut last_log_parse = std::time::Instant::now(); + let mut debounce_map: HashMap = HashMap::new(); + let debounce_duration = Duration::from_millis(500); + + loop { + // Periodically sync app state from shared state (updated by polling service) + if last_update.elapsed() >= update_interval { + let state_guard = shared_state.lock().await; + app.update_from_state(state_guard.clone()); + drop(state_guard); + last_update = std::time::Instant::now(); + } + + // Periodically parse main snakemake log for pipeline-level info + if last_log_parse.elapsed() >= log_parse_interval { + let state_guard = shared_state.lock().await; + let working_dir = state_guard.working_dir.clone(); + drop(state_guard); + + if let Ok(log_info) = parse_main_log(&working_dir) { + let mut state_guard = shared_state.lock().await; + state_guard.update_from_log_info(&log_info); + } + last_log_parse = std::time::Instant::now(); + } + + // Draw UI + terminal.draw(|frame| app.render(frame))?; + + // Handle keyboard events (non-blocking) + if app.poll_events(tick_rate)? 
{ + // Event was handled + } + + // Check for file watcher events (non-blocking) + if let Some(ref w) = watcher { + while let Some(event) = w.try_recv_nonblocking() { + match event { + WatcherEvent::MetadataFile(path) => { + // Debounce rapid changes to the same file + let path_str = path.to_string(); + let now = std::time::Instant::now(); + + if let Some(last_time) = debounce_map.get(&path_str) { + if now.duration_since(*last_time) < debounce_duration { + continue; // Skip this event - too soon + } + } + + debounce_map.insert(path_str, now); + + // Parse and merge the metadata file + if let Ok(job) = parse_metadata_file(&path) { + let mut state_guard = shared_state.lock().await; + merge_snakemake_jobs(&mut state_guard, vec![job]); + drop(state_guard); + } + } + WatcherEvent::MetadataDirectoryCreated => { + // Metadata directory was just created - scan for any existing files + let state_guard = shared_state.lock().await; + let working_dir = state_guard.working_dir.clone(); + drop(state_guard); + + if let Ok(jobs) = scan_metadata_dir(&working_dir) { + let mut state_guard = shared_state.lock().await; + merge_snakemake_jobs(&mut state_guard, jobs); + } + } + WatcherEvent::Error(err) => { + eprintln!("File watcher error: {}", err); + } + } + } + + // Clean up old debounce entries (keep map from growing unbounded) + let now = std::time::Instant::now(); + debounce_map.retain(|_, time| now.duration_since(*time) < debounce_duration * 10); + } + + // Periodic re-scan as fallback (in case file watcher misses events) + if last_rescan.elapsed() >= rescan_interval { + let state_guard = shared_state.lock().await; + let working_dir = state_guard.working_dir.clone(); + drop(state_guard); + + if let Ok(jobs) = scan_metadata_dir(&working_dir) { + if !jobs.is_empty() { + let mut state_guard = shared_state.lock().await; + merge_snakemake_jobs(&mut state_guard, jobs); + } + } + last_rescan = std::time::Instant::now(); + } + + // Check if we should quit + if app.should_quit { + return 
Ok(()); + } + + // Small sleep to prevent CPU spinning + tokio::time::sleep(Duration::from_millis(10)).await; + } +} diff --git a/crates/charmer/src/polling.rs b/crates/charmer/src/polling.rs new file mode 100644 index 0000000..a4412ce --- /dev/null +++ b/crates/charmer/src/polling.rs @@ -0,0 +1,418 @@ +//! Background polling service for SLURM and LSF schedulers. + +use charmer_lsf::{query_bhist, query_bjobs}; +use charmer_slurm::{query_resource_usage, query_sacct, query_squeue}; +use charmer_state::{ + merge_lsf_jobs, merge_slurm_jobs, FailureAnalysis, FailureMode, JobStatus, PipelineState, + ResourceUsage, +}; +use chrono::Utc; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::Mutex; +use tokio::time::interval; + +/// Scheduler type detection. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SchedulerType { + Slurm, + Lsf, +} + +/// Detect which scheduler is available. +pub async fn detect_scheduler() -> Option { + // Try SLURM first + if tokio::process::Command::new("squeue") + .arg("--version") + .output() + .await + .map(|o| o.status.success()) + .unwrap_or(false) + { + return Some(SchedulerType::Slurm); + } + + // Try LSF + if tokio::process::Command::new("bjobs") + .arg("-V") + .output() + .await + .map(|o| o.status.success()) + .unwrap_or(false) + { + return Some(SchedulerType::Lsf); + } + + None +} + +/// Configuration for the polling service. +#[derive(Debug, Clone)] +pub struct PollingConfig { + /// Interval for polling active jobs (squeue/bjobs). + pub active_poll_interval: Duration, + /// Interval for polling historical jobs (sacct/bhist). + pub history_poll_interval: Duration, + /// Run UUID filter (optional). + pub run_uuid: Option, + /// Hours of history to fetch. 
+ pub history_hours: u64, +} + +impl Default for PollingConfig { + fn default() -> Self { + Self { + active_poll_interval: Duration::from_secs(5), + history_poll_interval: Duration::from_secs(30), + run_uuid: None, + history_hours: 24, + } + } +} + +/// Polling service that runs in the background. +pub struct PollingService { + state: Arc>, + config: PollingConfig, + scheduler: SchedulerType, +} + +impl PollingService { + pub fn new( + state: Arc>, + config: PollingConfig, + scheduler: SchedulerType, + ) -> Self { + Self { + state, + config, + scheduler, + } + } + + /// Start the polling service in the background. + pub fn start(self) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + self.run().await; + }) + } + + /// Main polling loop. + async fn run(self) { + let mut active_ticker = interval(self.config.active_poll_interval); + let mut history_ticker = interval(self.config.history_poll_interval); + + // Skip the first tick (fires immediately) + active_ticker.tick().await; + history_ticker.tick().await; + + loop { + tokio::select! { + _ = active_ticker.tick() => { + self.poll_active_jobs().await; + } + _ = history_ticker.tick() => { + self.poll_historical_jobs().await; + } + } + } + } + + /// Poll active jobs (squeue or bjobs). + async fn poll_active_jobs(&self) { + match self.scheduler { + SchedulerType::Slurm => { + if let Err(e) = self.poll_squeue().await { + eprintln!("Error polling squeue: {}", e); + } + } + SchedulerType::Lsf => { + if let Err(e) = self.poll_bjobs().await { + eprintln!("Error polling bjobs: {}", e); + } + } + } + } + + /// Poll historical jobs (sacct or bhist). + async fn poll_historical_jobs(&self) { + match self.scheduler { + SchedulerType::Slurm => { + if let Err(e) = self.poll_sacct().await { + eprintln!("Error polling sacct: {}", e); + } + } + SchedulerType::Lsf => { + if let Err(e) = self.poll_bhist().await { + eprintln!("Error polling bhist: {}", e); + } + } + } + } + + /// Poll SLURM squeue. 
+ async fn poll_squeue(&self) -> Result<(), Box> { + let run_uuid = self.config.run_uuid.as_deref(); + let jobs = query_squeue(run_uuid).await?; + + let mut state = self.state.lock().await; + merge_slurm_jobs(&mut state, jobs, false); + + Ok(()) + } + + /// Poll SLURM sacct. + async fn poll_sacct(&self) -> Result<(), Box> { + let run_uuid = self.config.run_uuid.as_deref(); + let since = Some(Utc::now() - chrono::Duration::hours(self.config.history_hours as i64)); + let jobs = query_sacct(run_uuid, since).await?; + + let mut state = self.state.lock().await; + merge_slurm_jobs(&mut state, jobs, true); + + // Enrich failed jobs with failure analysis + self.enrich_failed_jobs_slurm(&mut state).await; + + // Enrich completed jobs with resource usage + self.enrich_completed_jobs_slurm(&mut state).await; + + Ok(()) + } + + /// Poll LSF bjobs. + async fn poll_bjobs(&self) -> Result<(), Box> { + let job_name_filter = self.config.run_uuid.as_deref(); + let jobs = query_bjobs(job_name_filter).await?; + + let mut state = self.state.lock().await; + merge_lsf_jobs(&mut state, jobs, false); + + Ok(()) + } + + /// Poll LSF bhist. + async fn poll_bhist(&self) -> Result<(), Box> { + let job_name_filter = self.config.run_uuid.as_deref(); + let since = Some(Utc::now() - chrono::Duration::hours(self.config.history_hours as i64)); + let jobs = query_bhist(job_name_filter, since).await?; + + let mut state = self.state.lock().await; + merge_lsf_jobs(&mut state, jobs, true); + + // Enrich failed jobs with failure analysis + self.enrich_failed_jobs_lsf(&mut state).await; + + Ok(()) + } + + /// Enrich failed SLURM jobs with detailed failure analysis. 
+ async fn enrich_failed_jobs_slurm(&self, state: &mut PipelineState) { + // Collect job IDs that need failure analysis + let jobs_needing_analysis: Vec<(String, String)> = state + .jobs + .iter() + .filter(|(_, job)| { + job.status == JobStatus::Failed + && job.slurm_job_id.is_some() + && job + .error + .as_ref() + .map(|e| e.analysis.is_none()) + .unwrap_or(true) + }) + .map(|(id, job)| (id.clone(), job.slurm_job_id.clone().unwrap())) + .take(5) // Limit to avoid too many queries + .collect(); + + // Analyze each failed job + for (job_id, slurm_job_id) in jobs_needing_analysis { + if let Ok(analysis) = charmer_slurm::analyze_failure(&slurm_job_id).await { + if let Some(job) = state.jobs.get_mut(&job_id) { + // Convert SLURM analysis to unified format + let unified_analysis = convert_slurm_analysis(&analysis); + + if let Some(ref mut error) = job.error { + error.analysis = Some(unified_analysis); + } else { + // Create error with analysis + job.error = Some(charmer_state::JobError { + exit_code: match &analysis.mode { + charmer_slurm::FailureMode::ExitCode { code, .. } => *code, + _ => -1, + }, + message: analysis.explanation.clone(), + analysis: Some(unified_analysis), + }); + } + } + } + } + } + + /// Enrich completed SLURM jobs with resource usage data. 
+ async fn enrich_completed_jobs_slurm(&self, state: &mut PipelineState) { + // Collect job IDs that need resource usage data + let jobs_needing_usage: Vec<(String, String)> = state + .jobs + .iter() + .filter(|(_, job)| { + // Get usage for completed and failed jobs that don't have it yet + matches!(job.status, JobStatus::Completed | JobStatus::Failed) + && job.slurm_job_id.is_some() + && job.usage.is_none() + }) + .map(|(id, job)| (id.clone(), job.slurm_job_id.clone().unwrap())) + .take(10) // Limit to avoid too many queries per poll + .collect(); + + // Query resource usage for each job + for (job_id, slurm_job_id) in jobs_needing_usage { + if let Ok(Some(usage)) = query_resource_usage(&slurm_job_id).await { + if let Some(job) = state.jobs.get_mut(&job_id) { + job.usage = Some(ResourceUsage { + max_rss_mb: usage.max_rss_mb, + elapsed_seconds: usage.elapsed_seconds, + cpu_time_seconds: usage.cpu_time_seconds, + }); + } + } + } + } + + /// Enrich failed LSF jobs with detailed failure analysis. 
+ async fn enrich_failed_jobs_lsf(&self, state: &mut PipelineState) { + // Collect job IDs that need failure analysis + let jobs_needing_analysis: Vec<(String, String)> = state + .jobs + .iter() + .filter(|(_, job)| { + job.status == JobStatus::Failed + && job.slurm_job_id.is_some() // LSF also uses slurm_job_id field + && job.error.as_ref().map(|e| e.analysis.is_none()).unwrap_or(true) + }) + .map(|(id, job)| (id.clone(), job.slurm_job_id.clone().unwrap())) + .take(5) // Limit to avoid too many queries + .collect(); + + // Analyze each failed job + for (job_id, lsf_job_id) in jobs_needing_analysis { + if let Ok(analysis) = charmer_lsf::analyze_failure(&lsf_job_id).await { + if let Some(job) = state.jobs.get_mut(&job_id) { + // Convert LSF analysis to unified format + let unified_analysis = convert_lsf_analysis(&analysis); + + if let Some(ref mut error) = job.error { + error.analysis = Some(unified_analysis); + } else { + // Create error with analysis + job.error = Some(charmer_state::JobError { + exit_code: match &analysis.mode { + charmer_lsf::FailureMode::ExitCode { code, .. } => *code, + _ => -1, + }, + message: analysis.explanation.clone(), + analysis: Some(unified_analysis), + }); + } + } + } + } + } +} + +/// Convert SLURM failure analysis to unified format. +fn convert_slurm_analysis(analysis: &charmer_slurm::FailureAnalysis) -> FailureAnalysis { + let mode = match &analysis.mode { + charmer_slurm::FailureMode::OutOfMemory { .. } => FailureMode::OutOfMemory, + charmer_slurm::FailureMode::Timeout { .. } => FailureMode::Timeout, + charmer_slurm::FailureMode::ExitCode { .. } => FailureMode::ExitCode, + charmer_slurm::FailureMode::Cancelled { .. } => FailureMode::Cancelled, + charmer_slurm::FailureMode::NodeFailure { .. } => FailureMode::NodeFailure, + charmer_slurm::FailureMode::Unknown { .. 
} => FailureMode::Unknown, + }; + + let (memory_used_mb, memory_limit_mb) = match &analysis.mode { + charmer_slurm::FailureMode::OutOfMemory { + used_mb, + requested_mb, + .. + } => (Some(*used_mb), Some(*requested_mb)), + _ => (analysis.max_rss_mb, analysis.req_mem_mb), + }; + + let (runtime_seconds, time_limit_seconds) = match &analysis.mode { + charmer_slurm::FailureMode::Timeout { + elapsed_seconds, + limit_seconds, + .. + } => (Some(*elapsed_seconds), Some(*limit_seconds)), + _ => (analysis.elapsed_seconds, analysis.time_limit_seconds), + }; + + FailureAnalysis { + mode, + explanation: analysis.explanation.clone(), + suggestion: analysis.suggestion.clone(), + memory_used_mb, + memory_limit_mb, + runtime_seconds, + time_limit_seconds, + } +} + +/// Convert LSF failure analysis to unified format. +fn convert_lsf_analysis(analysis: &charmer_lsf::FailureAnalysis) -> FailureAnalysis { + let mode = match &analysis.mode { + charmer_lsf::FailureMode::OutOfMemory { .. } => FailureMode::OutOfMemory, + charmer_lsf::FailureMode::Timeout { .. } => FailureMode::Timeout, + charmer_lsf::FailureMode::ExitCode { .. } => FailureMode::ExitCode, + charmer_lsf::FailureMode::Killed { .. } => FailureMode::Cancelled, + charmer_lsf::FailureMode::HostFailure { .. } => FailureMode::NodeFailure, + charmer_lsf::FailureMode::Unknown { .. } => FailureMode::Unknown, + }; + + let (memory_used_mb, memory_limit_mb) = match &analysis.mode { + charmer_lsf::FailureMode::OutOfMemory { + used_mb, limit_mb, .. + } => (Some(*used_mb), Some(*limit_mb)), + _ => (analysis.max_mem_mb, analysis.mem_limit_mb), + }; + + let (runtime_seconds, time_limit_seconds) = match &analysis.mode { + charmer_lsf::FailureMode::Timeout { + elapsed_seconds, + limit_seconds, + .. 
+ } => (Some(*elapsed_seconds), Some(*limit_seconds)), + _ => (analysis.run_time_seconds, analysis.run_limit_seconds), + }; + + FailureAnalysis { + mode, + explanation: analysis.explanation.clone(), + suggestion: analysis.suggestion.clone(), + memory_used_mb, + memory_limit_mb, + runtime_seconds, + time_limit_seconds, + } +} + +/// Initialize polling service and return a handle to it. +pub async fn init_polling( + state: Arc>, + config: PollingConfig, +) -> Option> { + // Detect scheduler + let scheduler = detect_scheduler().await?; + + eprintln!( + "Detected scheduler: {:?}, polling every {} seconds", + scheduler, + config.active_poll_interval.as_secs() + ); + + // Create and start the polling service + let service = PollingService::new(state, config, scheduler); + Some(service.start()) +} diff --git a/crates/charmer/src/watcher.rs b/crates/charmer/src/watcher.rs new file mode 100644 index 0000000..122154d --- /dev/null +++ b/crates/charmer/src/watcher.rs @@ -0,0 +1,145 @@ +//! File watcher for snakemake metadata directory. + +use camino::{Utf8Path, Utf8PathBuf}; +use miette::{IntoDiagnostic, Result}; +use notify::{Config, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher}; +use std::sync::mpsc::{channel, Receiver, Sender}; +use std::time::Duration; + +/// Events from the file watcher. +#[derive(Debug, Clone)] +pub enum WatcherEvent { + /// A metadata file was created or modified + MetadataFile(Utf8PathBuf), + /// The metadata directory was created + MetadataDirectoryCreated, + /// Watcher error + Error(String), +} + +/// File watcher for the metadata directory. +pub struct MetadataWatcher { + _watcher: RecommendedWatcher, + receiver: Receiver, + metadata_dir: Utf8PathBuf, +} + +impl MetadataWatcher { + /// Create a new metadata watcher for the given working directory. 
+ pub fn new(working_dir: &Utf8Path) -> Result { + let metadata_dir = working_dir.join(".snakemake").join("metadata"); + + let (tx, rx) = channel(); + + // Create watcher - pass working_dir to watch for .snakemake creation + let watcher = create_watcher(tx, metadata_dir.clone(), working_dir.to_owned())?; + + Ok(Self { + _watcher: watcher, + receiver: rx, + metadata_dir, + }) + } + + /// Get the metadata directory path. + #[allow(dead_code)] + pub fn metadata_dir(&self) -> &Utf8Path { + &self.metadata_dir + } + + /// Try to receive an event with a timeout. + #[allow(dead_code)] + pub fn try_recv(&self, timeout: Duration) -> Option { + self.receiver.recv_timeout(timeout).ok() + } + + /// Try to receive an event without blocking. + pub fn try_recv_nonblocking(&self) -> Option { + self.receiver.try_recv().ok() + } +} + +/// Create and configure the file watcher. +fn create_watcher( + tx: Sender, + metadata_dir: Utf8PathBuf, + working_dir: Utf8PathBuf, +) -> Result { + let tx_clone = tx.clone(); + let metadata_dir_clone = metadata_dir.clone(); + + let mut watcher = RecommendedWatcher::new( + move |res: notify::Result| match res { + Ok(event) => { + handle_event(event, &tx_clone, &metadata_dir_clone); + } + Err(e) => { + let _ = tx_clone.send(WatcherEvent::Error(e.to_string())); + } + }, + Config::default().with_poll_interval(Duration::from_millis(500)), + ) + .into_diagnostic()?; + + // Watch the metadata directory if it exists + if metadata_dir.exists() { + watcher + .watch(metadata_dir.as_std_path(), RecursiveMode::NonRecursive) + .into_diagnostic()?; + } + + // Also watch .snakemake directory to detect when metadata/ is created + let snakemake_dir = metadata_dir + .parent() + .unwrap_or_else(|| Utf8Path::new(".snakemake")); + if snakemake_dir.exists() { + // Ignore error if already watching + let _ = watcher.watch(snakemake_dir.as_std_path(), RecursiveMode::NonRecursive); + } + + // Always watch the working directory to detect .snakemake creation + let _ = 
watcher.watch(working_dir.as_std_path(), RecursiveMode::NonRecursive); + + Ok(watcher) +} + +/// Handle a file system event. +fn handle_event(event: Event, tx: &Sender, metadata_dir: &Utf8Path) { + match event.kind { + EventKind::Create(_) | EventKind::Modify(_) => { + for path in event.paths { + let path = match Utf8PathBuf::try_from(path) { + Ok(p) => p, + Err(_) => continue, + }; + + // Check if this is the metadata directory being created + if path == metadata_dir { + let _ = tx.send(WatcherEvent::MetadataDirectoryCreated); + continue; + } + + // Check if .snakemake or .snakemake/metadata was created + if path.file_name() == Some(".snakemake") || path.ends_with(".snakemake/metadata") { + let _ = tx.send(WatcherEvent::MetadataDirectoryCreated); + continue; + } + + // Check if this is a metadata file + if path.starts_with(metadata_dir) && path.is_file() { + // Skip hidden files + if let Some(name) = path.file_name() { + if name.starts_with('.') { + continue; + } + } + + let _ = tx.send(WatcherEvent::MetadataFile(path)); + } + } + } + _ => { + // Ignore other event types (access, remove, etc.) + } + } +} diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..1a95801 --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,92 @@ +# API Reference + +Charmer is organized as a Rust workspace with multiple crates. + +## Crates + +### charmer-core + +Snakemake metadata parsing. + +```rust +use charmer_core::{scan_metadata_dir, SnakemakeJob, SnakemakeMetadata}; + +// Scan metadata directory +let jobs = scan_metadata_dir(working_dir)?; + +// Parse single file +let job = parse_metadata_file(path)?; +``` + +### charmer-slurm + +SLURM integration. + +```rust +use charmer_slurm::{query_squeue, query_sacct, SlurmJob}; + +// Query active jobs +let active = query_squeue(Some("run-uuid")).await?; + +// Query historical jobs +let history = query_sacct(Some("run-uuid"), Some(since)).await?; +``` + +### charmer-lsf + +LSF integration. 
+ +```rust +use charmer_lsf::{query_bjobs, query_bhist, LsfJob}; + +// Query active jobs +let active = query_bjobs(Some("job-name")).await?; + +// Query historical jobs +let history = query_bhist(Some("job-name"), Some(since)).await?; +``` + +### charmer-state + +Unified state management. + +```rust +use charmer_state::{PipelineState, Job, JobStatus}; + +// Create state +let mut state = PipelineState::new(working_dir); + +// Merge data +merge_snakemake_jobs(&mut state, snakemake_jobs); +merge_slurm_jobs(&mut state, slurm_jobs, false); + +// Get counts +let counts = state.job_counts(); +``` + +### charmer-monitor + +TUI components. + +```rust +use charmer_monitor::App; + +// Create app +let app = App::new(state); + +// Handle events +app.handle_key(key_event); + +// Render +app.render(frame); +``` + +## Full API Documentation + +For complete API documentation, build the Rust docs: + +```bash +cargo doc --open +``` + +This will open the generated documentation in your browser. diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md new file mode 100644 index 0000000..4831acb --- /dev/null +++ b/docs/dev/architecture.md @@ -0,0 +1,166 @@ +# Architecture + +Charmer is organized as a Rust workspace with multiple crates. + +## Crate Structure + +``` +charmer/ +├── crates/ +│ ├── charmer/ # Main binary +│ ├── charmer-cli/ # CLI argument parsing +│ ├── charmer-core/ # Snakemake metadata parsing +│ ├── charmer-slurm/ # SLURM integration +│ ├── charmer-lsf/ # LSF integration +│ ├── charmer-state/ # Unified job state +│ └── charmer-monitor/ # TUI components +``` + +## Crate Responsibilities + +### charmer + +The main binary. 
Handles: +- Terminal setup/teardown +- Main event loop +- Coordinating polling and rendering + +### charmer-cli + +CLI argument parsing using clap: +- `Args` struct with all command-line options +- Validation and defaults + +### charmer-core + +Snakemake metadata parsing: +- Base64 filename decoding +- JSON metadata parsing +- Directory scanning + +### charmer-slurm + +SLURM integration: +- `squeue` parsing for active jobs +- `sacct` parsing for historical jobs +- State mapping to unified types + +### charmer-lsf + +LSF integration: +- `bjobs` parsing for active jobs +- `bhist` parsing for historical jobs +- State mapping to unified types + +### charmer-state + +Unified state management: +- `PipelineState` - All jobs and metadata +- `Job` - Unified job representation +- Merge functions for combining data sources +- Job correlation logic + +### charmer-monitor + +TUI components using ratatui: +- `App` - Main application state and event handling +- `Header` - Progress bar +- `JobList` - Scrollable job list +- `JobDetail` - Selected job information +- `Footer` - Keyboard shortcuts +- `LogViewer` - Log file display + +## Data Flow + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Snakemake │ │ SLURM │ │ LSF │ +│ Metadata │ │ squeue/sacct│ │ bjobs/bhist │ +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ + ▼ ▼ ▼ +┌──────────────────────────────────────────────────────┐ +│ charmer-state │ +│ │ +│ merge_snakemake_jobs() merge_slurm_jobs() │ +│ merge_lsf_jobs() │ +│ │ │ +│ ▼ │ +│ PipelineState │ +│ ┌─────────────┐ │ +│ │ Jobs │ │ +│ │ (HashMap) │ │ +│ └─────────────┘ │ +└──────────────────────────┬───────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────┐ +│ charmer-monitor │ +│ │ +│ Header ──────────────────────────────────────┐ │ +│ StatusBar ───────────────────────────────────│ │ +│ ┌─────────────────┬──────────────────────────┤ │ +│ │ JobList │ JobDetail │ │ +│ │ │ │ │ +│ └─────────────────┴──────────────────────────┘ 
│ +│ Footer ──────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────┘ +``` + +## Key Types + +### Job (charmer-state) + +```rust +pub struct Job { + pub id: String, + pub rule: String, + pub wildcards: Option, + pub outputs: Vec, + pub inputs: Vec, + pub status: JobStatus, + pub slurm_job_id: Option, + pub shellcmd: String, + pub timing: JobTiming, + pub resources: JobResources, + pub log_files: Vec, + pub error: Option, + pub data_sources: DataSources, +} +``` + +### JobStatus (charmer-state) + +```rust +pub enum JobStatus { + Pending, // Waiting for dependencies + Queued, // Submitted to scheduler + Running, // Currently executing + Completed, // Success + Failed, // Error + Cancelled, // User cancelled + Unknown, // Unknown state +} +``` + +## Event Loop + +```rust +loop { + // 1. Render UI + terminal.draw(|frame| app.render(frame))?; + + // 2. Handle input events (100ms timeout) + if app.poll_events(tick_rate)? { + // Key pressed, state updated + } + + // 3. Check for quit + if app.should_quit { + break; + } + + // 4. Background: poll scheduler, watch files + // (handled by separate tokio tasks) +} +``` diff --git a/docs/dev/contributing.md b/docs/dev/contributing.md new file mode 100644 index 0000000..646cc7b --- /dev/null +++ b/docs/dev/contributing.md @@ -0,0 +1,117 @@ +# Contributing + +Thank you for your interest in contributing to charmer! 
+ +## Development Setup + +### Prerequisites + +- Rust 1.85+ +- [Pixi](https://pixi.sh) (optional, for managing dependencies) + +### Clone and Build + +```bash +git clone https://github.com/rnabioco/charmer.git +cd charmer + +# Using cargo +cargo build + +# Or using pixi +pixi install +pixi run build +``` + +### Running Tests + +```bash +cargo test +``` + +### Running Lints + +```bash +cargo fmt --check +cargo clippy -- -D warnings +``` + +## Code Style + +- Follow Rust conventions and idioms +- Use `cargo fmt` to format code +- Use `cargo clippy` to catch common issues +- Write tests for new functionality +- Document public APIs + +## Pull Request Process + +1. Fork the repository +2. Create a feature branch from `develop` +3. Make your changes +4. Run tests and lints +5. Submit a pull request to `develop` + +### Branch Naming + +- `feature/description` - New features +- `fix/description` - Bug fixes +- `docs/description` - Documentation changes + +### Commit Messages + +Use clear, descriptive commit messages: + +``` +Add log viewer component + +- Implement scrollable log display +- Add follow mode for real-time updates +- Handle missing log files gracefully +``` + +## Architecture + +See [Architecture](architecture.md) for an overview of the codebase. + +## Testing + +### Unit Tests + +Add tests in the same file as the code: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_something() { + // ... + } +} +``` + +### Integration Tests + +Add integration tests in `tests/`: + +```rust +// tests/integration_test.rs +use charmer_state::PipelineState; + +#[test] +fn test_pipeline_state() { + // ... +} +``` + +## Documentation + +- Update relevant docs when changing functionality +- Use rustdoc comments for public APIs +- Preview docs with `cargo doc --open` + +## Questions? + +Open an issue for questions or discussion. 
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 0000000..3debe83 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,79 @@ +# Installation + +## Requirements + +- **Rust 1.85+** (for building from source) +- Access to SLURM or LSF cluster commands +- A running or completed Snakemake pipeline + +## From Source + +### Using Cargo + +```bash +# Clone the repository +git clone https://github.com/rnabioco/charmer.git +cd charmer + +# Build release binary +cargo build --release + +# The binary will be at target/release/charmer +./target/release/charmer --help + +# Or install to ~/.cargo/bin +cargo install --path crates/charmer +``` + +### Using Pixi (Recommended for Development) + +[Pixi](https://pixi.sh) manages both Rust and Python dependencies: + +```bash +# Install pixi if you haven't already +curl -fsSL https://pixi.sh/install.sh | bash + +# Clone and install +git clone https://github.com/rnabioco/charmer.git +cd charmer +pixi install + +# Build +pixi run build + +# Run +pixi run charmer +``` + +## Pre-built Binaries + +Pre-built binaries are available for each release: + +- `charmer-linux-x86_64.tar.gz` - Linux (glibc) +- `charmer-linux-x86_64-musl.tar.gz` - Linux (musl, static) +- `charmer-macos-x86_64.tar.gz` - macOS Intel +- `charmer-macos-aarch64.tar.gz` - macOS Apple Silicon + +Download from the [releases page](https://github.com/rnabioco/charmer/releases). 
+ +```bash +# Example for Linux +curl -LO https://github.com/rnabioco/charmer/releases/latest/download/charmer-linux-x86_64.tar.gz +tar xzf charmer-linux-x86_64.tar.gz +./charmer --help +``` + +## Verifying Installation + +```bash +# Check version +charmer --version + +# Show help +charmer --help +``` + +## Next Steps + +- [Quick Start Guide](quickstart.md) +- [Configuration Options](../guide/configuration.md) diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md new file mode 100644 index 0000000..b693067 --- /dev/null +++ b/docs/getting-started/quickstart.md @@ -0,0 +1,99 @@ +# Quick Start + +## Basic Usage + +Navigate to your Snakemake pipeline directory and run: + +```bash +cd /path/to/my/pipeline +charmer +``` + +The TUI will launch and display any jobs found in the `.snakemake/metadata/` directory and from scheduler queries. + +## Running Alongside Snakemake + +Start your pipeline in one terminal: + +```bash +snakemake --profile slurm -j 100 +``` + +In another terminal, start charmer: + +```bash +charmer +``` + +Charmer will automatically detect new jobs as Snakemake submits them. 
+ +## Navigating the Interface + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ charmer Running 01:23:45 │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ ██████████████████████░░░░░░░░░░░░░░ 42/100 jobs 42% │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ 12 Pending │ 8 Running │ 38 Done │ 1 Failed │ Filter: All │ Sort: Status │ +├───────────────────────────────────┬─────────────────────────────────────────┤ +│ Jobs (3/54) │ Job Details │ +│ ──────────────────────────────────│─────────────────────────────────────────│ +│ ● align_reads[sample=S1] │ Rule: align_reads │ +│ ● align_reads[sample=S2] │ SLURM Job: 12345678 │ +│ > ✗ call_variants[chr=chr1] │ Node: node01 │ +│ ◐ merge_vcfs │ Status: Failed (exit 1) │ +│ ○ annotate_variants │ Runtime: 05:23 │ +│ │ CPUs: 4 | Memory: 32GB │ +├───────────────────────────────────┴─────────────────────────────────────────┤ +│ j/k:navigate f:filter s:sort ?:help q:quit │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Status Symbols + +| Symbol | Status | +|--------|--------| +| `○` | Pending (waiting for dependencies) | +| `◐` | Queued (submitted to scheduler) | +| `●` | Running | +| `✓` | Completed | +| `✗` | Failed | +| `⊘` | Cancelled | + +### Keyboard Shortcuts + +| Key | Action | +|-----|--------| +| `j` / `↓` | Move selection down | +| `k` / `↑` | Move selection up | +| `g` | Go to first job | +| `G` | Go to last job | +| `f` | Cycle filter mode | +| `s` | Cycle sort mode | +| `?` | Show help | +| `q` | Quit | + +## Filtering Jobs + +Press `f` to cycle through filter modes: + +- **All** - Show all jobs +- **Running** - Only running jobs +- **Failed** - Only failed jobs +- **Pending** - Pending and queued jobs +- **Completed** - Successfully completed jobs + +## Sorting Jobs + +Press `s` to cycle through sort modes: + +- **Status** - Running first, then failed, 
queued, pending, completed +- **Rule** - Alphabetically by rule name +- **Time** - Most recently started first + +## Next Steps + +- [Configuration Options](../guide/configuration.md) +- [SLURM Integration](../guide/slurm.md) +- [LSF Integration](../guide/lsf.md) diff --git a/docs/guide/configuration.md b/docs/guide/configuration.md new file mode 100644 index 0000000..6bfe170 --- /dev/null +++ b/docs/guide/configuration.md @@ -0,0 +1,86 @@ +# Configuration + +## Command Line Options + +```bash +charmer [OPTIONS] [DIR] +``` + +### Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `DIR` | `.` | Pipeline directory to monitor | + +### Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--poll-interval ` | 5 | Seconds between scheduler queries | +| `--run-uuid ` | - | Filter to specific Snakemake run | +| `--theme ` | dark | Color theme (`dark` or `light`) | +| `--history-hours ` | 24 | Show completed jobs from last N hours | +| `-h, --help` | - | Print help information | +| `-V, --version` | - | Print version information | + +## Examples + +### Monitor Current Directory + +```bash +charmer +``` + +### Monitor Specific Directory + +```bash +charmer /path/to/pipeline +``` + +### Adjust Polling Frequency + +```bash +# Poll every 10 seconds (less frequent, lower overhead) +charmer --poll-interval 10 + +# Poll every 2 seconds (more frequent updates) +charmer --poll-interval 2 +``` + +### Filter to Specific Run + +If you have multiple pipelines running, filter by Snakemake run UUID: + +```bash +charmer --run-uuid abc123-def456 +``` + +### Use Light Theme + +```bash +charmer --theme light +``` + +### Show Longer History + +```bash +# Show completed jobs from last 48 hours +charmer --history-hours 48 +``` + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `USER` | Used for filtering scheduler queries to your jobs | +| `RUST_LOG` | Set to `debug` for verbose logging 
| + +## File Locations + +Charmer reads data from these locations: + +| Path | Description | +|------|-------------| +| `.snakemake/metadata/` | Snakemake job metadata files | +| `.snakemake/slurm_logs/` | SLURM job log files | +| `.snakemake/lsf_logs/` | LSF job log files | diff --git a/docs/guide/lsf.md b/docs/guide/lsf.md new file mode 100644 index 0000000..e941f80 --- /dev/null +++ b/docs/guide/lsf.md @@ -0,0 +1,90 @@ +# LSF Integration + +Charmer queries IBM Spectrum LSF to get real-time job status information. + +## How It Works + +Charmer uses two LSF commands: + +### bjobs (Active Jobs) + +Queries every `--poll-interval` seconds: + +```bash +bjobs -o "jobid stat queue submit_time start_time finish_time exec_host nprocs memlimit job_description delimiter='|'" -noheader +``` + +This retrieves: +- Job ID +- State +- Queue +- Submit/start/finish times +- Execution host +- Resources (processors, memory) +- Job description (contains rule info) + +### bhist (Job History) + +Queries every 30 seconds for completed jobs: + +```bash +bhist -a -l +``` + +## Snakemake LSF Executor + +Charmer works with the [snakemake-executor-plugin-lsf](https://github.com/snakemake/snakemake-executor-plugin-lsf). 
+ +### Job Correlation + +The LSF executor should set the job description to: + +``` +rule_{rulename}_wildcards_{wildcards} +``` + +### Snakemake Profile + +Example profile for LSF (`~/.config/snakemake/lsf/config.yaml`): + +```yaml +executor: lsf + +default-resources: + lsf_queue: "short" + mem_mb: 4000 + runtime: 60 +``` + +## Job States + +| LSF State | Charmer Status | Description | +|-----------|----------------|-------------| +| PEND | Queued | Waiting in queue | +| RUN | Running | Currently executing | +| DONE | Completed | Finished successfully (exit 0) | +| EXIT | Failed | Exited with non-zero status | +| PSUSP | Pending | Suspended while pending | +| USUSP | Pending | Suspended by user | +| SSUSP | Pending | Suspended by system | +| ZOMBI | Unknown | Zombie job | + +## Queue Information + +Charmer displays the LSF queue as the "partition" in job details, matching the SLURM terminology for consistency. + +## Troubleshooting + +### Jobs Not Appearing + +1. Check that you're running charmer as the same user who submitted jobs +2. Verify bjobs works: `bjobs -a` +3. Check the `--run-uuid` filter if specified + +### Permission Issues + +On some clusters, bhist may require special permissions. Contact your system administrator if historical jobs don't appear. + +### Missing Job Description + +Ensure your Snakemake profile or job submission script sets the job description field for proper rule correlation. 
diff --git a/docs/guide/shortcuts.md b/docs/guide/shortcuts.md new file mode 100644 index 0000000..6caf2db --- /dev/null +++ b/docs/guide/shortcuts.md @@ -0,0 +1,62 @@ +# Keyboard Shortcuts + +## Navigation + +| Key | Action | +|-----|--------| +| `j` / `↓` | Move selection down | +| `k` / `↑` | Move selection up | +| `g` / `Home` | Go to first job | +| `G` / `End` | Go to last job | + +## Filtering & Sorting + +| Key | Action | +|-----|--------| +| `f` | Cycle filter (All → Running → Failed → Pending → Completed) | +| `s` | Cycle sort (Status → Rule → Time) | + +## Log Viewer + +| Key | Action | +|-----|--------| +| `l` / `Enter` | Open log viewer for selected job | +| `F` | Toggle follow mode (auto-scroll to end) | +| `q` / `Escape` | Close log viewer | + +When in log viewer: + +| Key | Action | +|-----|--------| +| `j` / `↓` | Scroll down | +| `k` / `↑` | Scroll up | +| `g` | Go to beginning of log | +| `G` | Go to end of log | +| `F` | Toggle follow mode | +| `q` | Return to job list | + +## General + +| Key | Action | +|-----|--------| +| `?` | Toggle help overlay | +| `q` / `Ctrl+C` | Quit charmer | +| `r` | Force refresh (re-query scheduler) | + +## Filter Modes + +| Mode | Shows | +|------|-------| +| All | All jobs | +| Running | Jobs currently executing | +| Failed | Jobs that exited with errors | +| Pending | Jobs waiting for dependencies or queued | +| Completed | Successfully finished jobs | + +## Sort Modes + +| Mode | Order | +|------|-------| +| Status | Running → Failed → Queued → Pending → Completed | +| Rule | Alphabetically by rule name | +| Time | Most recently started first | diff --git a/docs/guide/slurm.md b/docs/guide/slurm.md new file mode 100644 index 0000000..0ab7385 --- /dev/null +++ b/docs/guide/slurm.md @@ -0,0 +1,101 @@ +# SLURM Integration + +Charmer queries SLURM to get real-time job status information. 
+ +## How It Works + +Charmer uses two SLURM commands: + +### squeue (Active Jobs) + +Queries every `--poll-interval` seconds: + +```bash +squeue -u $USER -h -o "%A|%j|%T|%P|%V|%S|%e|%N|%C|%m|%l|%k" +``` + +This retrieves: +- Job ID +- Job name (run UUID) +- State +- Partition +- Submit/start/end times +- Node list +- Resources (CPUs, memory, time limit) +- Comment field (contains rule info) + +### sacct (Job History) + +Queries every 30 seconds for completed jobs: + +```bash +sacct -X --parsable2 --noheader \ + --format=JobIDRaw,JobName,State,Partition,Submit,Start,End,NodeList,AllocCPUS,ReqMem,Timelimit,Comment,ExitCode \ + --starttime {since} +``` + +## Snakemake SLURM Executor + +Charmer is designed to work with the [snakemake-executor-plugin-slurm](https://github.com/snakemake/snakemake-executor-plugin-slurm). + +### Job Correlation + +The SLURM executor sets the comment field to: + +``` +rule_{rulename}_wildcards_{wildcards} +``` + +For example: +- `rule_align_reads_wildcards_sample=S1` +- `rule_call_variants_wildcards_sample=S1,chrom=chr1` + +Charmer parses this to correlate SLURM jobs with Snakemake rules. + +### Snakemake Profile + +Example profile for SLURM (`~/.config/snakemake/slurm/config.yaml`): + +```yaml +executor: slurm + +default-resources: + slurm_partition: "short" + mem_mb: 4000 + runtime: 60 + +# Enable comment field for charmer +set-resources: + __default__: + slurm_extra: "'--comment=rule_{rule}_wildcards_{wildcards}'" +``` + +## Job States + +| SLURM State | Charmer Status | Description | +|-------------|----------------|-------------| +| PENDING | Queued | Waiting in queue | +| RUNNING | Running | Currently executing | +| COMPLETED | Completed | Finished successfully | +| FAILED | Failed | Exited with error | +| CANCELLED | Cancelled | Cancelled by user | +| TIMEOUT | Failed | Exceeded time limit | +| OUT_OF_MEMORY | Failed | Exceeded memory limit | + +## Troubleshooting + +### Jobs Not Appearing + +1. 
Check that you're running charmer as the same user who submitted jobs +2. Verify squeue works: `squeue -u $USER` +3. Check the `--run-uuid` filter if specified + +### Missing Historical Jobs + +1. Increase `--history-hours` (default 24) +2. Verify sacct works: `sacct --starttime=now-24hours` +3. Check that SLURM accounting is enabled on your cluster + +### Comment Field Empty + +Ensure your Snakemake profile sets the comment field. See the profile example above. diff --git a/docs/images/demo.gif b/docs/images/demo.gif new file mode 100644 index 0000000..12d9d19 Binary files /dev/null and b/docs/images/demo.gif differ diff --git a/docs/images/quickstart.gif b/docs/images/quickstart.gif new file mode 100644 index 0000000..f527d88 Binary files /dev/null and b/docs/images/quickstart.gif differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..5b091f5 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,76 @@ +# charmer + +A terminal user interface (TUI) for monitoring Snakemake pipelines running on HPC clusters. + +
+ +<div class="grid cards" markdown> + +- :material-monitor: **Real-time Monitoring** + + Watch your pipeline jobs as they run on SLURM or LSF clusters. + +- :material-merge: **Unified View** + + Combines scheduler data with Snakemake metadata for complete visibility. + +- :material-keyboard: **Interactive TUI** + + Vim-style navigation, filtering, sorting, and log viewing. + +- :material-server: **Multi-Scheduler** + + Supports both SLURM and LSF cluster schedulers. + +</div> +
+ +## Demo + +![Charmer Demo](images/demo.gif) + +## Quick Start + +```bash +# Install +cargo install charmer + +# Run in your pipeline directory +cd /path/to/snakemake/pipeline +charmer +``` + +## Features + +### Job Monitoring + +- View all pipeline jobs with real-time status updates +- Filter by status: Running, Failed, Pending, Completed +- Sort by rule name, status, or start time +- See detailed job information including resources and timing + +### Log Viewing + +- View job log files directly in the TUI +- Follow mode for watching running jobs +- Scroll through historical output + +### Data Integration + +Charmer combines data from multiple sources: + +| Source | Data | +|--------|------| +| Snakemake metadata | Rule, inputs, outputs, shell command | +| squeue/bjobs | Active job status, resources | +| sacct/bhist | Historical data, exit codes | +| Log files | Job output, errors | + +## Requirements + +- Rust 1.85+ (for building from source) +- Access to SLURM or LSF cluster commands +- A running Snakemake pipeline + +## Next Steps + +- [Installation Guide](getting-started/installation.md) +- [Quick Start Tutorial](getting-started/quickstart.md) +- [Configuration Options](guide/configuration.md) diff --git a/docs/tapes/demo.tape b/docs/tapes/demo.tape new file mode 100644 index 0000000..5de1880 --- /dev/null +++ b/docs/tapes/demo.tape @@ -0,0 +1,108 @@ +# VHS Demo Tape for charmer +# Run from repo root: vhs docs/tapes/demo.tape + +Output docs/images/demo.gif + +Set Shell "bash" +Set FontSize 14 +Set Width 1200 +Set Height 700 +Set Theme "Catppuccin Mocha" +Set Padding 20 + +# Title screen +Type "# charmer - Snakemake Pipeline Monitor" +Enter +Sleep 1s + +# Show help briefly +Type "charmer --help" +Enter +Sleep 2s + +# Clear and start the TUI +Hide +Type "clear" +Enter +Show + +# Start charmer with demo pipeline +Type "charmer tests/pipelines/demo" +Enter + +# Watch jobs accumulate +Sleep 3s + +# Navigate down to see dependency chain highlighting +Type "j" 
+Sleep 400ms +Type "j" +Sleep 400ms +Type "j" +Sleep 400ms +Type "j" +Sleep 400ms +Type "j" +Sleep 1s + +# Watch more jobs complete +Sleep 2s + +# Go to bottom to see more jobs +Type "G" +Sleep 1.5s + +# Back to top +Type "g" +Sleep 1s + +# Switch to Rules view (Tab is a key press, not typed text) +Tab@100ms +Sleep 2s + +# Navigate rules +Type "j" +Sleep 500ms +Type "j" +Sleep 500ms +Type "j" +Sleep 1s + +# Back to Jobs view +Tab@100ms +Sleep 1s + +# Filter to running jobs only +Type "f" +Sleep 1.5s + +# Filter to completed +Type "f" +Sleep 1.5s + +# Back to all jobs +Type "f" +Sleep 500ms +Type "f" +Sleep 1s + +# Change sort order +Type "s" +Sleep 1s +Type "s" +Sleep 1s +Type "s" +Sleep 1s + +# Show help overlay +Type "?" +Sleep 2s +Type "?" +Sleep 500ms + +# Final view of accumulating jobs +Sleep 2s + +# Quit +Type "q" +Sleep 500ms diff --git a/docs/tapes/generate.sh b/docs/tapes/generate.sh new file mode 100755 index 0000000..29de00c --- /dev/null +++ b/docs/tapes/generate.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Change to repo root directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$ROOT_DIR" + +# Install charmer binary +cargo install --path crates/charmer + +# Clean and start the demo pipeline +pixi run clean-demo +snakemake --cores 4 --snakefile tests/pipelines/demo/Snakefile --directory tests/pipelines/demo & +PIPELINE_PID=$!
+ +# Wait for jobs to register +sleep 3 + +# Generate tapes from repo root +vhs docs/tapes/demo.tape +vhs docs/tapes/quickstart.tape + +# Cleanup +kill $PIPELINE_PID 2>/dev/null || true +wait $PIPELINE_PID 2>/dev/null || true + +echo "Tapes generated in docs/images/" diff --git a/docs/tapes/quickstart.tape b/docs/tapes/quickstart.tape new file mode 100644 index 0000000..6148c5a --- /dev/null +++ b/docs/tapes/quickstart.tape @@ -0,0 +1,46 @@ +# VHS Quickstart Tape for charmer +# Run from repo root: vhs docs/tapes/quickstart.tape + +Output docs/images/quickstart.gif + +Set Shell "bash" +Set FontSize 16 +Set Width 900 +Set Height 500 +Set Theme "Catppuccin Mocha" +Set Padding 20 + +Type "# Quick Start with charmer" +Enter +Sleep 1s + +Type "# Start monitoring" +Enter +Sleep 500ms + +Type "charmer tests/pipelines/demo" +Enter +Sleep 3s + +# Show basic navigation +Type "j" +Sleep 300ms +Type "j" +Sleep 300ms +Type "k" +Sleep 500ms + +# Filter to running jobs +Type "f" +Sleep 1s + +# Back to all +Type "f" +Type "f" +Type "f" +Type "f" +Sleep 500ms + +# Quit +Type "q" +Sleep 1s diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..906572e --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,70 @@ +site_name: charmer +site_description: TUI for monitoring Snakemake pipelines on HPC clusters +site_url: https://rnabioco.github.io/charmer +repo_url: https://github.com/rnabioco/charmer +repo_name: rnabioco/charmer + +theme: + name: material + palette: + - scheme: default + primary: deep purple + accent: purple + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + primary: deep purple + accent: purple + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.instant + - navigation.tracking + - navigation.sections + - navigation.expand + - toc.follow + - content.code.copy + - content.code.annotate + +plugins: + - search + - mkdocstrings: + handlers: + python: + paths: [src] + +markdown_extensions: + - 
pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - admonition + - pymdownx.details + - attr_list + - md_in_html + - tables + +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Quick Start: getting-started/quickstart.md + - User Guide: + - Configuration: guide/configuration.md + - SLURM Integration: guide/slurm.md + - LSF Integration: guide/lsf.md + - Keyboard Shortcuts: guide/shortcuts.md + - Development: + - Architecture: dev/architecture.md + - Contributing: dev/contributing.md + - API Reference: api/index.md + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/rnabioco/charmer diff --git a/pixi.toml b/pixi.toml new file mode 100644 index 0000000..963ac3c --- /dev/null +++ b/pixi.toml @@ -0,0 +1,77 @@ +[workspace] +name = "charmer" +version = "0.1.0" +description = "Snakemake pipeline monitor TUI for SLURM clusters" +channels = ["conda-forge", "bioconda"] +platforms = ["osx-arm64", "osx-64", "linux-64"] + +[tasks] +# Run a test snakemake pipeline (local execution) +test-pipeline = "cd tests/pipelines/simple && snakemake --cores 2 --snakefile Snakefile" + +# Clean test pipeline outputs (including .snakemake metadata) +clean-test = "cd tests/pipelines/simple && rm -rf results logs .snakemake" + +# Clean demo pipeline outputs +clean-demo = "cd tests/pipelines/demo && rm -rf results logs .snakemake" + +# Dry run to see what would be executed +dry-run = "cd tests/pipelines/simple && snakemake --cores 2 --dry-run" + +# Generate DAG visualization +dag = "cd tests/pipelines/simple && snakemake --dag | dot -Tpng > dag.png" + +# Build the Rust project +build = "cargo build" + +# Build release binary +release = "cargo build --release" + +# Run tests +test = "cargo test" + +# Run charmer in development mode +run = "cargo run -- ." 
+ +# Monitor the test pipeline +monitor-test = "cargo run -- tests/pipelines/simple" + +# Format code +fmt = "cargo fmt" + +# Run clippy lints +lint = "cargo clippy -- -D warnings" + +# Build documentation +docs = "mkdocs build" + +# Serve documentation locally +docs-serve = "mkdocs serve" + +# Deploy documentation to GitHub Pages +docs-deploy = "mkdocs gh-deploy --force" + +# Generate Rust API docs +rustdoc = "cargo doc --no-deps --open" + +# Generate VHS demo +demo = "cd docs/tapes && ./generate.sh" + +[dependencies] +# Snakemake and execution plugins +snakemake = ">=8.0" +snakemake-executor-plugin-slurm = "*" +snakemake-executor-plugin-cluster-generic = "*" + +# For DAG visualization +graphviz = "*" + +# Python for any helper scripts +python = ">=3.11" + +# Documentation +mkdocs = "*" +mkdocs-material = "*" + +# VHS for terminal recording +vhs = "*" diff --git a/tests/pipelines/cluster/.gitignore b/tests/pipelines/cluster/.gitignore new file mode 100644 index 0000000..6bf305c --- /dev/null +++ b/tests/pipelines/cluster/.gitignore @@ -0,0 +1,9 @@ +# Snakemake outputs +results/ +logs/ +.snakemake/ + +# Cluster job outputs +slurm-*.out +*.o* +*.e* diff --git a/tests/pipelines/cluster/Snakefile b/tests/pipelines/cluster/Snakefile new file mode 100644 index 0000000..58eaa61 --- /dev/null +++ b/tests/pipelines/cluster/Snakefile @@ -0,0 +1,277 @@ +# Cluster test pipeline for charmer development +# Designed to create many pending jobs and demonstrate failure handling +# +# DAG Structure (~76 jobs): +# process_sample (12) -> align_sample (12) -> call_variants (48) +# | +# merge_sample_variants (12) +# | +# group_analysis (2) +# | +# final_merge (1) +# | +# generate_report (1) + +configfile: "config/config.yaml" + +# Load samples configuration +import yaml +with open(config["samples_file"]) as f: + samples_config = yaml.safe_load(f) + +SAMPLES = list(samples_config["samples"].keys()) +CHROMOSOMES = config["chromosomes"] +GROUPS = config["groups"] +DELAYS = 
config["delays"] + +# Helper to get sample metadata +def get_sample_meta(sample, key, default=None): + return samples_config["samples"].get(sample, {}).get(key, default) + +# Get samples that should succeed (no fail_at defined) +SUCCESSFUL_SAMPLES = [s for s in SAMPLES if get_sample_meta(s, "fail_at") is None] + +# Get samples by group (excluding failure group) +def get_samples_by_group(group): + return [s for s in SAMPLES + if get_sample_meta(s, "group") == group + and get_sample_meta(s, "fail_at") is None] + +rule all: + input: + "results/final_report.txt" + +rule process_sample: + """Initial sample processing - first step for all samples.""" + output: + "results/processed/{sample}.txt" + params: + delay=DELAYS["process"], + fail_at=lambda wc: get_sample_meta(wc.sample, "fail_at"), + exit_code=lambda wc: get_sample_meta(wc.sample, "exit_code", 1) + threads: 1 + resources: + mem_mb=2000, + runtime=10 + log: + "logs/process_sample/{sample}.log" + shell: + """ + exec > {log} 2>&1 + echo "Starting processing for {wildcards.sample}..." 
+ echo "INFO: Initializing sample processing pipeline" + echo "INFO: Checking input data integrity" + + # Check if this sample should fail at this stage + if [ "{params.fail_at}" = "process" ]; then + echo "ERROR: Data validation failed for {wildcards.sample}" + echo "ERROR: Input file appears corrupted or malformed" + echo "ERROR: Cannot proceed with processing" + sleep 5 + exit {params.exit_code} + fi + + sleep {params.delay} + echo "INFO: Sample {wildcards.sample} processed successfully" + echo "Processed: {wildcards.sample}" > {output} + """ + +rule align_sample: + """Alignment step - depends on processing.""" + input: + "results/processed/{sample}.txt" + output: + "results/aligned/{sample}.bam" + params: + delay=DELAYS["align"], + fail_at=lambda wc: get_sample_meta(wc.sample, "fail_at"), + exit_code=lambda wc: get_sample_meta(wc.sample, "exit_code", 1) + threads: 2 + resources: + mem_mb=4000, + runtime=15 + log: + "logs/align_sample/{sample}.log" + shell: + """ + exec > {log} 2>&1 + echo "Starting alignment for {wildcards.sample}..." 
+ echo "INFO: Loading reference genome" + sleep 5 + echo "INFO: Indexing reference" + sleep 3 + echo "INFO: Running aligner on {wildcards.sample}" + + # Check if this sample should fail at this stage + if [ "{params.fail_at}" = "align" ]; then + echo "ERROR: Out of memory while aligning {wildcards.sample}" + echo "ERROR: Process killed by OOM killer" + echo "ERROR: Peak memory usage: 32.5 GB (limit: 16 GB)" + echo "ERROR: Consider increasing memory allocation" + sleep 2 + exit {params.exit_code} + fi + + sleep {params.delay} + echo "INFO: Alignment complete for {wildcards.sample}" + echo "INFO: Mapped reads: 45,234,567" + echo "INFO: Unmapped reads: 1,234,567" + echo "Aligned: {wildcards.sample}" > {output} + """ + +rule call_variants: + """Variant calling per chromosome - creates fan-out pattern.""" + input: + bam="results/aligned/{sample}.bam" + output: + vcf="results/variants/{sample}_{chrom}.vcf" + params: + delay=DELAYS["variants"], + fail_at=lambda wc: get_sample_meta(wc.sample, "fail_at"), + exit_code=lambda wc: get_sample_meta(wc.sample, "exit_code", 1) + threads: 2 + resources: + mem_mb=4000, + runtime=20 + log: + "logs/call_variants/{sample}_{chrom}.log" + shell: + """ + exec > {log} 2>&1 + echo "Starting variant calling for {wildcards.sample} on {wildcards.chrom}..." 
+ echo "INFO: Loading BAM file" + sleep 3 + echo "INFO: Scanning {wildcards.chrom} for variants" + + # Check if this sample should fail at this stage + if [ "{params.fail_at}" = "call_variants" ]; then + echo "ERROR: Variant calling timed out for {wildcards.sample} on {wildcards.chrom}" + echo "ERROR: Wall clock limit exceeded" + echo "ERROR: Job ran for 3600 seconds (limit: 1800 seconds)" + sleep 2 + exit {params.exit_code} + fi + + sleep {params.delay} + echo "INFO: Found 12,345 variants on {wildcards.chrom}" + echo "INFO: SNPs: 10,234 Indels: 2,111" + echo "Variants: {wildcards.sample} {wildcards.chrom}" > {output.vcf} + """ + +rule merge_sample_variants: + """Merge variants per sample across chromosomes - fan-in pattern.""" + input: + expand("results/variants/{{sample}}_{chrom}.vcf", chrom=CHROMOSOMES) + output: + "results/merged/{sample}_merged.vcf" + params: + delay=DELAYS["merge"], + fail_at=lambda wc: get_sample_meta(wc.sample, "fail_at"), + exit_code=lambda wc: get_sample_meta(wc.sample, "exit_code", 1) + threads: 1 + resources: + mem_mb=2000, + runtime=10 + log: + "logs/merge_sample/{sample}.log" + shell: + """ + exec > {log} 2>&1 + echo "Starting variant merge for {wildcards.sample}..." 
+ echo "INFO: Merging VCF files from {CHROMOSOMES}" + + # Check if this sample should fail at this stage + if [ "{params.fail_at}" = "merge" ]; then + echo "ERROR: Segmentation fault while merging {wildcards.sample}" + echo "ERROR: Memory corruption detected at 0x7fff5fbff8c0" + echo "ERROR: Stack trace:" + echo "ERROR: #0 merge_vcf_records() at vcfmerge.c:234" + echo "ERROR: #1 main() at vcfmerge.c:567" + sleep 2 + exit {params.exit_code} + fi + + sleep {params.delay} + cat {input} > {output} + echo "INFO: Merge complete for {wildcards.sample}" + echo "INFO: Total variants: 49,380" + """ + +rule group_analysis: + """Per-group analysis - aggregates samples by group.""" + input: + lambda wc: expand("results/merged/{sample}_merged.vcf", + sample=get_samples_by_group(wc.group)) + output: + "results/groups/{group}_analysis.txt" + params: + delay=DELAYS["group_analysis"] + threads: 2 + resources: + mem_mb=4000, + runtime=15 + log: + "logs/group_analysis/{group}.log" + shell: + """ + exec > {log} 2>&1 + echo "Starting group analysis for {wildcards.group}..." + echo "INFO: Aggregating samples in group" + echo "INFO: Input samples: {input}" + sleep {params.delay} + echo "Group {wildcards.group} analysis complete" > {output} + echo "INFO: Analysis complete for {wildcards.group}" + """ + +rule final_merge: + """Final merge of all group analyses.""" + input: + expand("results/groups/{group}_analysis.txt", group=GROUPS) + output: + "results/final_analysis.txt" + params: + delay=DELAYS["final_merge"] + threads: 1 + resources: + mem_mb=2000, + runtime=10 + log: + "logs/final_merge.log" + shell: + """ + exec > {log} 2>&1 + echo "Starting final merge of all groups..." 
+ echo "INFO: Combining group analyses" + sleep {params.delay} + cat {input} > {output} + echo "INFO: Final merge complete" + """ + +rule generate_report: + """Generate final pipeline report.""" + input: + analysis="results/final_analysis.txt", + merged=expand("results/merged/{sample}_merged.vcf", sample=SUCCESSFUL_SAMPLES) + output: + "results/final_report.txt" + params: + delay=DELAYS["report"] + threads: 1 + resources: + mem_mb=1000, + runtime=5 + log: + "logs/generate_report.log" + shell: + """ + exec > {log} 2>&1 + echo "Generating final report..." + echo "INFO: Collecting pipeline statistics" + sleep {params.delay} + echo "=== Pipeline Complete ===" > {output} + echo "Successful samples: {SUCCESSFUL_SAMPLES}" >> {output} + echo "Groups analyzed: {GROUPS}" >> {output} + echo "Chromosomes: {CHROMOSOMES}" >> {output} + date >> {output} + echo "INFO: Report generated successfully" + """ diff --git a/tests/pipelines/cluster/cluster/lsf/config.yaml b/tests/pipelines/cluster/cluster/lsf/config.yaml new file mode 100644 index 0000000..6b67c5b --- /dev/null +++ b/tests/pipelines/cluster/cluster/lsf/config.yaml @@ -0,0 +1,48 @@ +# Snakemake LSF Executor Profile Configuration +# ============================================= +# This profile configures Snakemake to submit jobs to LSF clusters +# Usage: snakemake --profile cluster/lsf + +# Executor settings +executor: lsf +jobs: 50 # Maximum concurrent jobs +latency-wait: 15 # Wait time for output files (seconds) + +# Default resources for all rules +default-resources: + - "runtime=30" + - "mem_mb=4000" + - "lsf_queue=rna" + - "lsf_project=charmer-test" + - 'lsf_extra=""' + +# Rule-specific resource overrides +set-resources: + - process_sample:runtime=10 + - process_sample:mem_mb=2000 + + - align_sample:runtime=15 + - align_sample:mem_mb=4000 + + - call_variants:runtime=20 + - call_variants:mem_mb=4000 + + - merge_sample_variants:runtime=10 + - merge_sample_variants:mem_mb=2000 + + - group_analysis:runtime=15 + - 
group_analysis:mem_mb=4000 + + - final_merge:runtime=10 + - final_merge:mem_mb=2000 + + - generate_report:runtime=5 + - generate_report:mem_mb=1000 + +# Behavior settings +rerun-incomplete: true +keep-going: true + +# Output settings +printshellcmds: true +show-failed-logs: true diff --git a/tests/pipelines/cluster/cluster/slurm/config.yaml b/tests/pipelines/cluster/cluster/slurm/config.yaml new file mode 100644 index 0000000..280c7da --- /dev/null +++ b/tests/pipelines/cluster/cluster/slurm/config.yaml @@ -0,0 +1,63 @@ +# Snakemake SLURM Executor Profile Configuration +# ================================================ +# This profile configures Snakemake to submit jobs to SLURM clusters +# Usage: snakemake --profile cluster/slurm + +# Executor settings +executor: slurm +jobs: 50 # Maximum concurrent jobs +latency-wait: 60 # Wait time for output files (seconds) + +# Default resources for all rules +default-resources: + slurm_partition: "amilan" # Default CPU partition (Alpine) + slurm_account: "amc-general" # Alpine allocation account + slurm_qos: "normal" # Default QoS + runtime: 30 # Default walltime in minutes + mem_mb: 4000 # Default memory in MB (4GB) + cpus_per_task: 1 # Default CPUs per task + +# Rule-specific resource overrides +set-resources: + process_sample: + runtime: 10 + mem_mb: 2000 + cpus_per_task: 1 + + align_sample: + runtime: 15 + mem_mb: 4000 + cpus_per_task: 2 + + call_variants: + runtime: 20 + mem_mb: 4000 + cpus_per_task: 2 + + merge_sample_variants: + runtime: 10 + mem_mb: 2000 + cpus_per_task: 1 + + group_analysis: + runtime: 15 + mem_mb: 4000 + cpus_per_task: 2 + + final_merge: + runtime: 10 + mem_mb: 2000 + cpus_per_task: 1 + + generate_report: + runtime: 5 + mem_mb: 1000 + cpus_per_task: 1 + +# Behavior settings +rerun-incomplete: true +keep-going: true + +# Output settings +printshellcmds: true +show-failed-logs: true diff --git a/tests/pipelines/cluster/config/config.yaml b/tests/pipelines/cluster/config/config.yaml new file mode 
100644 index 0000000..760f9f5 --- /dev/null +++ b/tests/pipelines/cluster/config/config.yaml @@ -0,0 +1,27 @@ +# Pipeline configuration for cluster test pipeline + +project: "charmer-test" +samples_file: "config/samples.yaml" +output_dir: "results" + +# Chromosomes for variant calling (creates fan-out) +chromosomes: + - chr1 + - chr2 + - chr3 + - chr4 + +# Groups for analysis +groups: + - control + - treatment + +# Job timing (seconds) - short for testing +delays: + process: 30 + align: 60 + variants: 45 + merge: 20 + group_analysis: 30 + final_merge: 15 + report: 10 diff --git a/tests/pipelines/cluster/config/samples.yaml b/tests/pipelines/cluster/config/samples.yaml new file mode 100644 index 0000000..8f866d3 --- /dev/null +++ b/tests/pipelines/cluster/config/samples.yaml @@ -0,0 +1,62 @@ +# Sample configuration for cluster test pipeline +# 12 samples: 8 successful + 4 designed to fail at different stages + +samples: + # Control group - 4 samples + ctrl-rep1: + group: control + replicate: 1 + + ctrl-rep2: + group: control + replicate: 2 + + ctrl-rep3: + group: control + replicate: 3 + + ctrl-rep4: + group: control + replicate: 4 + + # Treatment group - 4 samples + treat-rep1: + group: treatment + replicate: 1 + + treat-rep2: + group: treatment + replicate: 2 + + treat-rep3: + group: treatment + replicate: 3 + + treat-rep4: + group: treatment + replicate: 4 + + # Samples designed to FAIL with different errors + fail-oom: + group: failure + replicate: 1 + fail_at: align # Fails during alignment + exit_code: 137 # OOM kill signal + + fail-timeout: + group: failure + replicate: 2 + fail_at: call_variants # Fails during variant calling + exit_code: 124 # Timeout signal + + fail-error: + group: failure + replicate: 3 + fail_at: process # Fails during initial processing + exit_code: 1 # Generic error + + fail-segfault: + group: failure + replicate: 4 + fail_at: merge # Fails during merge + exit_code: 139 # Segfault signal diff --git a/tests/pipelines/demo/.gitignore 
b/tests/pipelines/demo/.gitignore new file mode 100644 index 0000000..2bd4b1a --- /dev/null +++ b/tests/pipelines/demo/.gitignore @@ -0,0 +1,3 @@ +results/ +logs/ +.snakemake/ diff --git a/tests/pipelines/demo/Snakefile b/tests/pipelines/demo/Snakefile new file mode 100644 index 0000000..1bf679b --- /dev/null +++ b/tests/pipelines/demo/Snakefile @@ -0,0 +1,154 @@ +# Demo pipeline optimized for charmer VHS recordings +# Designed to show: job accumulation, dependency chains, multiple rules + +SAMPLES = [f"sample{i}" for i in range(1, 13)] # 12 samples +REGIONS = ["region_A", "region_B", "region_C"] + +rule all: + input: + "results/final_summary.txt" + +rule download: + """Download raw data - fast, runs first for all samples.""" + output: + "results/raw/{sample}.fastq" + params: + delay = lambda wc: 2 + (hash(wc.sample) % 3) # 2-4 seconds, varies by sample + log: + "logs/download/{sample}.log" + shell: + """ + exec > {log} 2>&1 + echo "Downloading {wildcards.sample}..." + sleep {params.delay} + echo "Downloaded" > {output} + """ + +rule quality_check: + """QC step - depends on download.""" + input: + "results/raw/{sample}.fastq" + output: + "results/qc/{sample}_qc.txt" + params: + delay = lambda wc: 3 + (hash(wc.sample) % 4) # 3-6 seconds + log: + "logs/qc/{sample}.log" + shell: + """ + exec > {log} 2>&1 + echo "Running QC on {wildcards.sample}..." + sleep {params.delay} + echo "QC passed" > {output} + """ + +rule align: + """Alignment - depends on QC, longer running.""" + input: + fastq = "results/raw/{sample}.fastq", + qc = "results/qc/{sample}_qc.txt" + output: + "results/aligned/{sample}.bam" + params: + delay = lambda wc: 5 + (hash(wc.sample) % 5) # 5-9 seconds + log: + "logs/align/{sample}.log" + shell: + """ + exec > {log} 2>&1 + echo "Aligning {wildcards.sample}..." 
+        sleep {params.delay}
+        echo "Aligned" > {output}
+        """
+
+rule call_peaks:
+    """Peak calling per region - creates fan-out from alignment."""
+    input:
+        "results/aligned/{sample}.bam"
+    output:
+        "results/peaks/{sample}_{region}.bed"
+    params:
+        delay = lambda wc: 4 + (sum(ord(c) for c in wc.sample + wc.region) % 4)  # 4-7 seconds; deterministic (builtin hash() is salted per run)
+    log:
+        "logs/peaks/{sample}_{region}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Calling peaks for {wildcards.sample} in {wildcards.region}..."
+        sleep {params.delay}
+        echo "Peaks found" > {output}
+        """
+
+rule merge_peaks:
+    """Merge peaks per sample - depends on all regions."""
+    input:
+        expand("results/peaks/{{sample}}_{region}.bed", region=REGIONS)
+    output:
+        "results/merged/{sample}_peaks.bed"
+    params:
+        delay = 3
+    log:
+        "logs/merge/{sample}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Merging peaks for {wildcards.sample}..."
+        sleep {params.delay}
+        cat {input} > {output}
+        """
+
+rule annotate:
+    """Annotate merged peaks."""
+    input:
+        "results/merged/{sample}_peaks.bed"
+    output:
+        "results/annotated/{sample}_annotated.txt"
+    params:
+        delay = 4
+    log:
+        "logs/annotate/{sample}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Annotating {wildcards.sample}..."
+        sleep {params.delay}
+        echo "Annotated" > {output}
+        """
+
+rule sample_report:
+    """Per-sample report - end of chain per sample."""
+    input:
+        annotated = "results/annotated/{sample}_annotated.txt",
+        bam = "results/aligned/{sample}.bam"
+    output:
+        "results/reports/{sample}_report.txt"
+    params:
+        delay = 2
+    log:
+        "logs/report/{sample}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Generating report for {wildcards.sample}..."
+        sleep {params.delay}
+        echo "Report for {wildcards.sample}" > {output}
+        """
+
+rule final_summary:
+    """Final summary - depends on all sample reports."""
+    input:
+        expand("results/reports/{sample}_report.txt", sample=SAMPLES)
+    output:
+        "results/final_summary.txt"
+    params:
+        delay = 3
+    log:
+        "logs/final_summary.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Creating final summary..."
+        sleep {params.delay}
+        echo "Pipeline complete" > {output}
+        cat {input} >> {output}
+        """
diff --git a/tests/pipelines/simple/.gitignore b/tests/pipelines/simple/.gitignore
new file mode 100644
index 0000000..14db849
--- /dev/null
+++ b/tests/pipelines/simple/.gitignore
@@ -0,0 +1,5 @@
+# Snakemake outputs
+results/
+.snakemake/
+logs/
+dag.png
diff --git a/tests/pipelines/simple/Snakefile b/tests/pipelines/simple/Snakefile
new file mode 100644
index 0000000..53a6c33
--- /dev/null
+++ b/tests/pipelines/simple/Snakefile
@@ -0,0 +1,151 @@
+# Simple test pipeline for charmer development
+# Simulates a basic bioinformatics workflow (~4-5 minutes with 2 cores)
+
+SAMPLES = ["sample1", "sample2", "sample3", "sample4", "sample5", "sample6"]
+CHROMOSOMES = ["chr1", "chr2", "chr3", "chr4"]
+
+# sample3 will fail during alignment to test failure display
+FAILING_SAMPLE = "sample3"
+
+rule all:
+    input:
+        "results/final_report.txt", expand("results/aligned/{sample}.bam", sample=SAMPLES)  # all BAMs so sample3's simulated failure is actually scheduled (downstream rules exclude it)
+
+rule process_sample:
+    """Initial sample processing - runs first for all samples."""
+    output:
+        "results/processed/{sample}.txt"
+    params:
+        delay = 10  # Doubled from 5
+    log:
+        "logs/process_sample/{sample}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Starting processing for {wildcards.sample}..."
+        echo "INFO: Initializing sample processing pipeline"
+        sleep {params.delay}
+        echo "INFO: Sample {wildcards.sample} processed successfully"
+        echo "Processed: {wildcards.sample}" > {output}
+        """
+
+rule align_sample:
+    """Alignment step - depends on processing. sample3 will fail."""
+    input:
+        "results/processed/{sample}.txt"
+    output:
+        "results/aligned/{sample}.bam"
+    params:
+        delay = 16,  # Doubled from 8
+        failing_sample = FAILING_SAMPLE
+    log:
+        "logs/align_sample/{sample}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Starting alignment for {wildcards.sample}..."
+        echo "INFO: Loading reference genome"
+        sleep 5
+        echo "INFO: Running aligner on {wildcards.sample}"
+
+        # Simulate failure for sample3
+        if [ "{wildcards.sample}" = "{params.failing_sample}" ]; then
+            echo "ERROR: Out of memory while aligning {wildcards.sample}"
+            echo "ERROR: Process killed by OOM killer"
+            echo "ERROR: Peak memory usage: 32.5 GB (limit: 16 GB)"
+            sleep 2
+            exit 137 # Simulates OOM kill
+        fi
+
+        sleep {params.delay}
+        echo "INFO: Alignment complete for {wildcards.sample}"
+        echo "Aligned: {wildcards.sample}" > {output}
+        """
+
+rule call_variants:
+    """Variant calling per chromosome - depends on alignment."""
+    input:
+        bam = "results/aligned/{sample}.bam"
+    output:
+        vcf = "results/variants/{sample}_{chrom}.vcf"
+    params:
+        delay = 12  # Doubled from 6
+    log:
+        "logs/call_variants/{sample}_{chrom}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Starting variant calling for {wildcards.sample} on {wildcards.chrom}..."
+        echo "INFO: Loading BAM file"
+        sleep 3
+        echo "INFO: Scanning {wildcards.chrom} for variants"
+        sleep {params.delay}
+        echo "INFO: Found variants on {wildcards.chrom}"
+        echo "Variants: {wildcards.sample} {wildcards.chrom}" > {output.vcf}
+        """
+
+rule merge_sample_variants:
+    """Merge variants per sample across chromosomes."""
+    input:
+        expand("results/variants/{{sample}}_{chrom}.vcf", chrom=CHROMOSOMES)
+    output:
+        "results/merged/{sample}_merged.vcf"
+    params:
+        delay = 8  # Doubled from 4
+    log:
+        "logs/merge_sample/{sample}.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Starting variant merge for {wildcards.sample}..."
+        echo "INFO: Merging {CHROMOSOMES} VCF files"
+        sleep {params.delay}
+        cat {input} > {output}
+        echo "INFO: Merge complete for {wildcards.sample}"
+        """
+
+rule final_merge:
+    """Final merge of all sample variants."""
+    input:
+        expand("results/merged/{sample}_merged.vcf", sample=[s for s in SAMPLES if s != FAILING_SAMPLE])
+    output:
+        "results/all_variants.vcf"
+    params:
+        delay = 10  # Doubled from 5
+    log:
+        "logs/final_merge.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Starting final merge of all variants..."
+        echo "INFO: Combining all sample VCFs"
+        sleep {params.delay}
+        cat {input} > {output}
+        echo "INFO: Final merge complete"
+        """
+
+rule generate_report:
+    """Generate final pipeline report."""
+    input:
+        vcf = "results/all_variants.vcf",
+        bams = expand("results/aligned/{sample}.bam", sample=[s for s in SAMPLES if s != FAILING_SAMPLE])
+    output:
+        "results/final_report.txt"
+    params:
+        delay = 6  # Doubled from 3
+    log:
+        "logs/generate_report.log"
+    shell:
+        """
+        exec > {log} 2>&1
+        echo "Generating final report..."
+        echo "INFO: Collecting pipeline statistics"
+        sleep {params.delay}
+        echo "=== Pipeline Complete ===" > {output}
+        echo "Samples processed: {SAMPLES}" >> {output}
+        echo "Note: {FAILING_SAMPLE} failed during alignment" >> {output}
+        echo "Chromosomes: {CHROMOSOMES}" >> {output}
+        echo "Total jobs: $(find results -type f | wc -l)" >> {output}
+        date >> {output}
+        echo "INFO: Report generated successfully"
+        """
diff --git a/tests/pipelines/simple/profile/config.yaml b/tests/pipelines/simple/profile/config.yaml
new file mode 100644
index 0000000..f3caf79
--- /dev/null
+++ b/tests/pipelines/simple/profile/config.yaml
@@ -0,0 +1,22 @@
+# SLURM profile for testing charmer
+# Use: snakemake --profile profile/
+
+executor: slurm
+
+# Default resources for all rules
+default-resources:
+  slurm_partition: "short"
+  mem_mb: 1000
+  runtime: 10  # minutes
+
+# Job grouping
+group-components: 1  # NOTE(review): --group-components normally takes NAME=SIZE entries; confirm a bare integer is accepted here
+
+# Retries
+retries: 1
+
+# Keep going on failures for testing
+keep-going: true
+
+# Local cores for local rules
+local-cores: 1