Skip to content

Commit e965aa2

Browse files
committed
refactoring cli app
1 parent b7b415b commit e965aa2

File tree

12 files changed

+212
-151
lines changed

12 files changed

+212
-151
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "ga4ghphetools"
3-
version = "0.5.17"
3+
version = "0.5.19"
44
edition = "2021"
55
keywords = ["GA4GH", "Phenopacket Schema", "Human Phenotype Ontology"]
66
description = "Generate GA4GH phenopackets from tabular data"

bin/commands/compare.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
use clap::{ArgMatches};
2+
3+
/// Build the `compare` subcommand: compares two cohorts and exports the
/// comparison to an Excel file.
pub fn command() -> clap::Command {
    let mut cmd = clap::Command::new("compare")
        .about("Compare two cohorts and export to Excel");
    // The four mandatory options all follow the same name == long-flag pattern.
    for name in ["cohort1", "cohort2", "output", "hpo"] {
        cmd = cmd.arg(clap::Arg::new(name).long(name).required(true));
    }
    cmd.arg(
        clap::Arg::new("threshold")
            .long("threshold")
            .default_value("1"),
    )
}
16+
17+
/// Handler for the `compare` subcommand.
///
/// Delegates to the `excel` handler: both subcommands declare the same
/// arguments and produce the same Excel comparison output.
#[cfg(feature = "excel_export")]
pub fn handle(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
    crate::commands::excel::handle(sub_matches)
}

/// Fallback used when the binary is compiled without the `excel_export`
/// feature: prints a notice to stderr and returns `Ok`.
#[cfg(not(feature = "excel_export"))]
pub fn handle(_sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
    eprintln!("This binary was built without the `excel_export` feature");
    Ok(())
}

bin/commands/etl.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
use clap::{Arg, ArgMatches};
2+
use ga4ghphetools::dto::etl_dto::EtlDto;
3+
4+
/// Returns the `clap::Command` for ETL
5+
pub fn command() -> clap::Command {
6+
clap::Command::new("etl")
7+
.about("Test converting an EtlDto to CohortData")
8+
.arg(Arg::new("input").short('i').long("input").required(true))
9+
.arg(Arg::new("hpo").short('o').long("hpo").required(true))
10+
}
11+
12+
/// Handler for the subcommand
13+
pub fn handle(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
14+
let input = sub_matches.get_one::<String>("input").unwrap();
15+
let hpo_path = sub_matches.get_one::<String>("hpo").unwrap();
16+
let hpo = crate::load_hpo(hpo_path)?;
17+
18+
let contents = std::fs::read_to_string(input)
19+
.map_err(|e| format!("Failed to read file: {}", e))?;
20+
21+
let dto: EtlDto = serde_json::from_str(&contents)
22+
.map_err(|e| format!("Failed to deserialize JSON: {}", e))?;
23+
24+
let cohort = ga4ghphetools::etl::get_cohort_data_from_etl_dto(hpo.clone(), dto)?;
25+
let json = serde_json::to_string_pretty(&cohort)?;
26+
println!("{}", json);
27+
28+
Ok(())
29+
}

bin/commands/excel.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
use clap::{Arg, ArgMatches};
2+
3+
4+
#[cfg(feature = "excel_export")]
5+
use ga4ghphetools::export::output_excel_comparison;
6+
7+
/// Returns the `clap::Command` for this subcommand
8+
pub fn command() -> clap::Command {
9+
clap::Command::new("excel")
10+
.about("Compare two cohorts and export to Excel")
11+
.arg(Arg::new("cohort1").long("cohort1").required(true))
12+
.arg(Arg::new("cohort2").long("cohort2").required(true))
13+
.arg(Arg::new("output").long("output").required(true))
14+
.arg(Arg::new("hpo").long("hpo").required(true))
15+
.arg(
16+
Arg::new("threshold")
17+
.long("threshold")
18+
.default_value("1"),
19+
)
20+
}
21+
22+
/// Handler for the `excel` subcommand.
///
/// Reads the four required options plus `--threshold`, loads the HPO
/// ontology, and writes an Excel comparison of the two cohorts to `--output`.
///
/// # Errors
/// Fails if `--threshold` does not parse as `usize`, the ontology cannot be
/// loaded, or the comparison/export itself fails.
#[cfg(feature = "excel_export")]
pub fn handle(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
    // The four options are `required(true)`, so these unwraps cannot fail.
    let cohort_1 = sub_matches.get_one::<String>("cohort1").unwrap();
    let cohort_2 = sub_matches.get_one::<String>("cohort2").unwrap();
    let output = sub_matches.get_one::<String>("output").unwrap();
    let hpo_path = sub_matches.get_one::<String>("hpo").unwrap();
    // `threshold` has a default value ("1"), so `unwrap` is safe; parsing to
    // usize may still fail for a non-numeric user-supplied value.
    let threshold: usize = sub_matches.get_one::<String>("threshold").unwrap().parse()?;
    let hpo = crate::load_hpo(hpo_path)?;

    output_excel_comparison(cohort_1, cohort_2, output, hpo, threshold).map_err(|e| e.into())
}

/// Fallback used when the binary is compiled without the `excel_export`
/// feature: prints a notice to stderr and returns `Ok`.
#[cfg(not(feature = "excel_export"))]
pub fn handle(_sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
    eprintln!("This binary was built without the `excel_export` feature");
    Ok(())
}

bin/commands/json.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
use crate::commands::util::extract_file_name;
2+
3+
use clap::ArgMatches;
4+
5+
/// Build the `json` subcommand: runs Q/C on a Cohort JSON file.
pub fn command() -> clap::Command {
    clap::Command::new("json")
        .about("Q/C Cohort JSON file")
        // The handler reads this argument unconditionally, so let clap
        // enforce its presence instead of failing later with an `expect`.
        .arg(clap::Arg::new("cohort").short('c').long("cohort").required(true))
        .arg(clap::Arg::new("hpo").short('o').long("hpo").required(true))
}
11+
12+
/// Handler for the `json` subcommand.
///
/// Loads the cohort JSON file given by `--cohort` and the HPO ontology given
/// by `--hpo`, runs the Q/C assessment, and reports the result on
/// stdout (success) or stderr (failure).
///
/// # Errors
/// Returns an error if an argument is missing, the ontology cannot be
/// constructed, or the cohort file cannot be loaded.
pub fn handle(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
    // BUG FIX: the argument is registered under the id "cohort" in
    // `command()`; the previous lookup under "input" could never succeed.
    let json_input_path = sub_matches
        .get_one::<String>("cohort")
        .ok_or("Missing required --cohort argument")?;
    let hpo_path = sub_matches
        .get_one::<String>("hpo")
        .ok_or("Missing required --hpo argument")?;
    // Propagate failures instead of panicking via `expect`.
    let hpo = crate::load_hpo(hpo_path)?;
    let cohort = ga4ghphetools::factory::load_json_cohort(json_input_path)
        .map_err(|e| format!("Could not load Cohort JSON file: {e:?}"))?;
    let cohort_file_name = extract_file_name(json_input_path);
    match ga4ghphetools::factory::qc_assessment(hpo, &cohort) {
        Ok(_) => println!("No Q/C issues identified for {cohort_file_name}."),
        // `eprintln!` (was `eprint!`) so the message ends with a newline.
        Err(e) => eprintln!("Error for {cohort_file_name}: {e}"),
    }

    Ok(())
}
25+

bin/commands/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pub mod excel;
2+
pub mod etl;
3+
pub mod compare;
4+
pub mod json;
5+
pub mod removeterm;
6+
mod util;

bin/commands/removeterm.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
use std::fs::File;
2+
3+
use clap::{Arg, ArgMatches};
4+
use crate::commands::util::extract_file_name;
5+
6+
pub fn command() -> clap::Command {
7+
clap::Command::new("remove-term")
8+
.about("Remove HPO Term and its annotations from Cohort Data file")
9+
.arg(Arg::new("cohort").short('c').long("cohort").required(true))
10+
.arg(Arg::new("hpo-id").short('i').long("id").required(true))
11+
}
12+
13+
14+
pub fn handle(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
15+
let input_json = sub_matches.get_one::<String>("cohort").unwrap();
16+
let hpo_id = sub_matches.get_one::<String>("hpo-id").unwrap();
17+
println!("Remove HPO Term {hpo_id} from Cohort {input_json}");
18+
let cohort = ga4ghphetools::factory::load_json_cohort(input_json).expect("Could not load Cohort JSON file");
19+
let modified_cohort = cohort.remove_hpo_column(hpo_id)?;
20+
let fname = extract_file_name(input_json);
21+
let outname = format!("modified-{fname}");
22+
let json = serde_json::to_string_pretty(&modified_cohort)?;
23+
println!("{}", json);
24+
let file = File::create(outname)?;
25+
let writer = std::io::BufWriter::new(file);
26+
27+
serde_json::to_writer_pretty(writer, &modified_cohort)?;
28+
29+
30+
Ok(())
31+
}

bin/commands/util.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
use std::path::Path;
2+
3+
/// Get the file name at the end of the path.
/// If the path has no usable (UTF-8) file name, return the original path.
pub(crate) fn extract_file_name(input_path: &str) -> String {
    Path::new(input_path)
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_string)
        .unwrap_or_else(|| input_path.to_string())
}

bin/main.rs

Lines changed: 21 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -1,161 +1,34 @@
1-
// src/main.rs
2-
use clap::{Arg, ArgMatches, Command};
3-
use ga4ghphetools::dto::etl_dto::EtlDto;
1+
mod commands;
2+
3+
use clap::Command;
44
use ontolius::{io::OntologyLoaderBuilder, ontology::csr::FullCsrOntology};
55
use std::sync::Arc;
66

7-
8-
#[cfg(feature = "excel_export")]
9-
use ga4ghphetools::export::output_excel_comparison;
10-
11-
127
fn main() {
13-
let matches = Command::new("phetools")
8+
let mut cmd = Command::new("phetools")
149
.about("GA4GH Phenopacket Schema Curation Library Demo")
15-
.version(env!("CARGO_PKG_VERSION"))
16-
.subcommand(
17-
Command::new("excel")
18-
.about("Test loading of legacy Excel template")
19-
.arg(Arg::new("template").short('t').long("template").required(true))
20-
.arg(Arg::new("hpo").short('o').long("hpo").required(true))
21-
)
22-
.subcommand(
23-
Command::new("json")
24-
.about("Test loading of new JSON template")
25-
.arg(Arg::new("json").short('i').long("input"))
26-
.arg(Arg::new("hpo").short('o').long("hpo").required(true))
27-
)
28-
.subcommand(
29-
Command::new("etl")
30-
.about("Test converting an EtlDto to CohortData")
31-
.arg(Arg::new("input").short('i').long("input").required(true))
32-
.arg(Arg::new("hpo").short('o').long("hpo").required(true))
33-
)
34-
.subcommand(
35-
Command::new("version")
36-
.about("Show library version")
37-
.arg(Arg::new("version").short('v').long("version"))
38-
)
39-
.subcommand(
40-
Command::new("excel")
41-
.about("Compare two cohorts and export to Excel")
42-
.arg(Arg::new("cohort1").long("cohort1").required(true))
43-
.arg(Arg::new("cohort2").long("cohort2").required(true))
44-
.arg(Arg::new("output").long("output").required(true))
45-
.arg(Arg::new("hpo").long("hpo").required(true))
46-
.arg(
47-
Arg::new("threshold")
48-
.long("threshold")
49-
.default_value("1"),
50-
)
51-
)
52-
.get_matches();
10+
.version(env!("CARGO_PKG_VERSION"))
11+
.subcommand(commands::excel::command())
12+
.subcommand(commands::etl::command())
13+
.subcommand(commands::compare::command())
14+
.subcommand(commands::json::command())
15+
.subcommand(commands::removeterm::command());
16+
17+
let matches = cmd.clone().get_matches();
18+
5319
match matches.subcommand() {
54-
Some(("excel", sub_matches)) => handle_excel(sub_matches).expect("Could not start excel command"),
55-
Some(("json", sub_matches)) => {
56-
let input = sub_matches.get_one::<String>("input").unwrap();
57-
println!("json: {}", input);
58-
},
59-
Some(("etl", sub_matches)) => handle_etl(sub_matches).expect("Could not start ETL command"),
60-
Some(("version", sub_matches)) => {
61-
println!("Version: {}", env!("CARGO_PKG_VERSION"));
62-
},
63-
Some(("compare", sub_matches)) => {
64-
#[cfg(feature = "excel_export")]
65-
handle_compare(sub_matches).expect("Excel comparison failed");
66-
67-
#[cfg(not(feature = "excel_export"))]
68-
eprintln!("This binary was built without the `excel_export` feature");
69-
}
70-
_ => println!("No subcommand was used"),
20+
Some(("excel", sub_matches)) => commands::excel::handle(sub_matches).unwrap(),
21+
Some(("etl", sub_matches)) => commands::etl::handle(sub_matches).unwrap(),
22+
Some(("compare", sub_matches)) => commands::compare::handle(sub_matches).unwrap(),
23+
Some(("json", sub_matches)) => commands::json::handle(sub_matches).unwrap(),
24+
Some(("remove-term", sub_matches)) => commands::removeterm::handle(sub_matches).unwrap(),
25+
_ => cmd.print_help().unwrap(),
7126
}
72-
73-
}
74-
75-
#[cfg(feature = "excel_export")]
76-
fn handle_compare(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
77-
let cohort_1 = sub_matches
78-
.get_one::<String>("cohort1")
79-
.expect("cohort1 is required");
80-
81-
let cohort_2 = sub_matches
82-
.get_one::<String>("cohort2")
83-
.expect("cohort2 is required");
84-
85-
let output = sub_matches
86-
.get_one::<String>("output")
87-
.expect("output is required");
88-
89-
let hpo_path = sub_matches
90-
.get_one::<String>("hpo")
91-
.expect("hpo is required");
92-
93-
let threshold: usize = sub_matches
94-
.get_one::<String>("threshold")
95-
.unwrap()
96-
.parse()?;
97-
98-
let hpo = load_hpo(hpo_path)?;
99-
100-
output_excel_comparison(
101-
cohort_1,
102-
cohort_2,
103-
output,
104-
hpo,
105-
threshold,
106-
)
107-
.map_err(|e| e.into())
108-
}
109-
110-
fn handle_excel(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
111-
let template = sub_matches
112-
.get_one::<String>("template")
113-
.expect("template argument is required");
114-
let hpo = sub_matches
115-
.get_one::<String>("hpo")
116-
.ok_or("Missing required --hpo argument")?;
117-
118-
let hpo_arc = load_hpo(hpo)?;
119-
test_load_template(hpo_arc, template);
120-
Ok(())
121-
}
122-
123-
fn handle_etl(sub_matches: &ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
124-
let input = sub_matches.get_one::<String>("input").unwrap();
125-
println!("ETL: {}", input);
126-
let hpo = sub_matches
127-
.get_one::<String>("hpo")
128-
.ok_or("Missing required --hpo argument")?;
129-
let hpo_arc = load_hpo(hpo)?;
130-
let contents = std::fs::read_to_string(input)
131-
.map_err(|e| format!("Failed to read file: {}", e)).unwrap();
132-
let dto: EtlDto = serde_json::from_str(&contents)
133-
.map_err(|e| format!("Failed to deserialize JSON: {}", e)).unwrap();
134-
135-
let cohort = ga4ghphetools::etl::get_cohort_data_from_etl_dto(hpo_arc.clone(), dto)?;
136-
let json = serde_json::to_string_pretty(&cohort).unwrap();
137-
println!("{}", json);
138-
Ok(())
13927
}
14028

141-
142-
fn load_hpo(json_path: &str) -> Result<Arc<FullCsrOntology>, Box<dyn std::error::Error>> {
29+
/// Load HPO JSON
30+
pub fn load_hpo(json_path: &str) -> Result<Arc<FullCsrOntology>, Box<dyn std::error::Error>> {
14331
let loader = OntologyLoaderBuilder::new().obographs_parser().build();
14432
let hpo: FullCsrOntology = loader.load_from_path(json_path)?;
14533
Ok(Arc::new(hpo))
14634
}
147-
148-
149-
150-
fn test_load_template(hpo: Arc<FullCsrOntology>, template: &str) {
151-
match ga4ghphetools::factory::load_pyphetools_excel_template(template, false, hpo,|p,q|{
152-
println!("{}/{} variants validated", p, q);}) {
153-
Ok(cohort_dto) => {
154-
println!("[INFO] No errors identified for {:?}\n\n\n", cohort_dto);
155-
}
156-
Err(e) => {
157-
println!("[ERROR] {:?}", e);
158-
return;
159-
}
160-
}
161-
}

src/dto/cohort_dto.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,5 +410,24 @@ impl CohortData {
410410
pub fn phenopackets_schema_version() -> String {
411411
return PHETOOLS_SCHEMA_VERSION.to_string()
412412
}
413+
414+
    /// Overwrite this cohort's stored schema version with the current
    /// `PHETOOLS_SCHEMA_VERSION`.
    pub fn set_to_latest_schema_version(&mut self) {
        self.phetools_schema_version = PHETOOLS_SCHEMA_VERSION.to_string();
    }
417+
418+
419+
pub fn remove_hpo_column(&self, tid: &str) -> Result<CohortData, String> {
420+
let mut cohort = self.clone();
421+
let idx = cohort
422+
.hpo_headers
423+
.iter()
424+
.position(|h| h.hpo_id == tid)
425+
.ok_or_else(|| format!("Could not find column that corresponds to {tid}"))?;
426+
cohort.hpo_headers.remove(idx);
427+
for row in cohort.rows.iter_mut() {
428+
row.hpo_data.remove(idx);
429+
}
430+
return Ok(cohort);
431+
}
413432

414433
}

0 commit comments

Comments
 (0)