Skip to content

Commit ef07fd0

Browse files
committed
refactoring repo qc
1 parent 03bc6bf commit ef07fd0

File tree

6 files changed

+109
-51
lines changed

6 files changed

+109
-51
lines changed

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "ga4ghphetools"
3-
version = "0.5.2"
3+
version = "0.5.3"
44
edition = "2021"
55
keywords = ["GA4GH", "Phenopacket Schema", "Human Phenotype Ontology"]
66
description = "Generate GA4GH phenopackets from tabular data"
@@ -36,7 +36,7 @@ rand = "0.9.2"
3636
regex = "1.12.1"
3737
serde = { version = "1.0.228", features = ["derive"] }
3838
zip = "6.0.0"
39-
reqwest = {version = "0.12.28", features = ["blocking", "json"]}
39+
reqwest = {version = "0.13.1", features = ["blocking", "json"]}
4040
serde_json = { version = "1.0.148", features = ["preserve_order"] }
4141
clap = { version = "4.5.53", features = ["derive"], optional = true }
4242
rayon = "=1.10.0"

src/repo/cohort_qc.rs

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ impl CohortQc {
2929
} else {
3030
// by design, Mendelian can only have one disease
3131
let ddata = &cohort.disease_list[0];
32-
let dqc = DiseaseQc::new(ddata);
32+
let dqc = DiseaseQc::new(ddata, &cohort);
3333
disease_to_ppkt_d.insert(ddata.disease_id.clone(), dqc);
3434
}
3535
}
@@ -56,6 +56,25 @@ impl CohortQc {
5656
})
5757
}
5858

59+
60+
pub fn get_errors(&self) -> Vec<QcReport> {
61+
let mut errs: Vec<QcReport> = self
62+
.unexpected_files
63+
.iter()
64+
.map(|file| QcReport::unexpected_file(&self.cohort_name, file))
65+
.collect();
66+
67+
errs.extend(self.check_moi());
68+
69+
errs.extend(
70+
self.disease_qc_list
71+
.iter()
72+
.filter_map(|dqc| dqc.check_all_rows_output_as_ppkt()),
73+
);
74+
75+
errs
76+
}
77+
5978

6079
fn get_disease_id(ppkt: &Phenopacket) -> Result<String, String> {
6180
if ppkt.diseases.len() != 1 {
@@ -67,27 +86,16 @@ impl CohortQc {
6786
}
6887
}
6988

70-
pub fn has_unexpected_files(&self) -> QcReport {
71-
if self.unexpected_files.is_empty() {
72-
QcReport::no_unepected_files(&self.cohort_name)
73-
} else {
74-
QcReport::unexpected_files(&self.cohort_name, &self.unexpected_files)
75-
}
76-
}
89+
7790

7891
pub fn ppkt_count(&self) -> usize {
7992
return self.disease_qc_list.iter().map(|dqc|dqc.phenopacket_count()).sum();
8093
}
8194

82-
pub fn check_moi(&self) -> usize {
83-
let mut misfit = 0 as usize;
84-
for dqc in &self.disease_qc_list {
85-
if let Some(output) = dqc.check_moi() {
86-
println!("{}", output);
87-
misfit += 1;
88-
}
89-
}
90-
misfit
95+
/// Runs `check_moi` on every entry of `disease_qc_list` and returns all of the
/// resulting reports as one flat list, in list order.
pub fn check_moi(&self) -> Vec<QcReport> {
    let mut reports: Vec<QcReport> = Vec::new();
    for disease_qc in self.disease_qc_list.iter() {
        reports.extend(disease_qc.check_moi());
    }
    reports
}
92100

93101

src/repo/disease_qc.rs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,24 @@ use std::collections::HashSet;
22

33
use phenopackets::schema::v2::{Phenopacket, core::genomic_interpretation::Call};
44

5-
use crate::dto::cohort_dto::{DiseaseData};
5+
use crate::{dto::cohort_dto::{CohortData, DiseaseData}, repo::qc_report::QcReport};
66

77

88

99
#[derive(Clone, Debug)]
1010
pub struct DiseaseQc {
1111
disease_data: DiseaseData,
12-
ppkt_list: Vec<Phenopacket>
12+
ppkt_list: Vec<Phenopacket>,
13+
cohort: CohortData,
1314
}
1415

1516

1617
impl DiseaseQc {
17-
pub fn new(disease_data: &DiseaseData) -> Self {
18+
pub fn new(disease_data: &DiseaseData, cohort: &CohortData) -> Self {
1819
Self {
1920
disease_data: disease_data.clone(),
20-
ppkt_list: Vec::new()
21+
ppkt_list: Vec::new(),
22+
cohort: cohort.clone()
2123
}
2224
}
2325

@@ -30,7 +32,8 @@ impl DiseaseQc {
3032
return self.ppkt_list.len();
3133
}
3234

33-
pub fn check_moi(&self) -> Option<String> {
35+
pub fn check_moi(&self) -> Vec<QcReport> {
36+
let mut errs: Vec<QcReport> = Vec::new();
3437
let mut allowable_allele_counts: HashSet<usize> = HashSet::new();
3538
for moi in &self.disease_data.mode_of_inheritance_list {
3639
if moi.is_autosomal_dominant() {
@@ -40,16 +43,17 @@ impl DiseaseQc {
4043
} else if moi.is_x_chromosomal() {
4144
allowable_allele_counts.insert(1);
4245
} else {
43-
return Some(format!("Did not recognize MOI: {:?}", moi));
46+
eprintln!("Did not recognize MOI: {:?}", moi);
4447
}
4548
}
4649
for ppkt in &self.ppkt_list {
4750
let ac = Self::get_allele_count(ppkt);
4851
if ! allowable_allele_counts.contains(&ac) {
49-
return Some(format!("{}: Expected counts of {:?} but got {} for {}.", ppkt.id,allowable_allele_counts, ac, self.disease_data_display()))
52+
let qc = QcReport::moi_mismatch(&self.disease_data_display(), &ppkt.id, &allowable_allele_counts, ac);
53+
errs.push(qc);
5054
}
5155
}
52-
None
56+
errs
5357
}
5458

5559
fn disease_data_display(&self) -> String {
@@ -87,4 +91,15 @@ impl DiseaseQc {
8791

8892
ac
8993
}
94+
95+
pub fn check_all_rows_output_as_ppkt(&self) -> Option<QcReport> {
96+
let n_nrows = self.cohort.rows.len();
97+
let n_phenopackets = self.phenopacket_count();
98+
if n_nrows == n_phenopackets {
99+
return None;
100+
} else {
101+
return Some(QcReport::count_mismatch(&self.disease_data_display(), n_nrows, n_phenopackets))
102+
}
103+
}
104+
90105
}

src/repo/gpt_repository.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ impl GptRepository {
4343
.map(|cl| cl.get_cohort_qc())
4444
.collect();
4545
match result {
46-
Ok(cohort_qc_list) => Ok(RepoQc::new(cohort_qc_list)),
46+
Ok(cohort_qc_list) => Ok(RepoQc::new(&self.path, cohort_qc_list)),
4747
Err(e) => Err(e),
4848
}
4949
}
@@ -81,9 +81,7 @@ mod tests {
8181
fn test_repo(repo_path: String) {
8282
let repo = GptRepository::new(&repo_path);
8383
let repoqc = repo.repo_qc().unwrap();
84-
let count = repoqc.phenopacket_count();
85-
println!("Total phenopackets: {}", count);
86-
repoqc.check_moi();
84+
8785
}
8886

8987

src/repo/qc_report.rs

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::collections::HashSet;
2+
13

24

35
#[derive(Debug, Clone, serde::Serialize)] // Serialize helps if passing to a web-based GUI
@@ -11,21 +13,41 @@ pub struct QcReport {
1113

1214

1315
impl QcReport {
14-
pub fn no_unepected_files(cohort_name: &str) -> Self {
15-
Self {
16-
cohort_name: cohort_name.to_string(),
17-
message: "No unexpected files".to_string(),
18-
is_ok: true }
19-
}
16+
2017

21-
pub fn unexpected_files(cohort_name: &str, unexpected: &Vec<String>) -> Self {
22-
let msg = format!("Unexpected files: {}", unexpected.join("; "));
18+
pub fn unexpected_file(cohort_name: &str, unexpected: &str) -> Self {
19+
let msg = format!("Unexpected file: {}", unexpected);
2320
Self { cohort_name: cohort_name.to_string(),
2421
message: msg,
2522
is_ok: false
2623
}
2724
}
2825

26+
pub fn moi_mismatch(cohort_name: &str, ppkt_id: &str, allowable_allele_counts: &HashSet<usize>, ac: usize) -> Self {
27+
let set = format!(
28+
"{{{}}}",
29+
allowable_allele_counts
30+
.iter()
31+
.map(|n| n.to_string())
32+
.collect::<Vec<_>>()
33+
.join(",")
34+
);
35+
let message= format!("Expected counts of {} but got {} for {}.", set,ac, ppkt_id);
36+
Self { cohort_name: cohort_name.to_string(),
37+
message,
38+
is_ok: false
39+
}
40+
}
41+
42+
pub fn count_mismatch(cohort_name: &str, n_nrows: usize, n_phenopackets: usize) -> Self {
43+
let message = format!("Rows: {} - exported phenopackets: {}", n_nrows, n_phenopackets);
44+
Self {
45+
cohort_name: cohort_name.to_string(),
46+
message,
47+
is_ok: false,
48+
}
49+
}
50+
2951

3052

3153
}

src/repo/repo_qc.rs

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,50 @@
1-
use crate::repo::{cohort_qc::CohortQc};
1+
use std::path::PathBuf;
2+
3+
use crate::repo::{cohort_qc::CohortQc, qc_report::QcReport};
24

35

46

57

68
pub struct RepoQc {
7-
cohort_qc_list: Vec<CohortQc>
9+
pub repo_path: String,
10+
pub cohort_count: usize,
11+
pub phenopacket_count: usize,
12+
pub errors: Vec<QcReport>
813
}
914

1015

1116
impl RepoQc {
12-
pub fn new(cohort_qc_list: Vec<CohortQc>) -> Self {
17+
pub fn new(repository_path: &PathBuf, cohort_qc_list: Vec<CohortQc>) -> Self {
18+
let phenopacket_count = Self::phenopacket_count(&cohort_qc_list);
19+
let cohort_count = cohort_qc_list.len();
20+
let repo_path: String = repository_path.to_string_lossy().to_string();
21+
let errors = Self::get_errors(cohort_qc_list);
1322
Self {
14-
cohort_qc_list
23+
repo_path,
24+
cohort_count,
25+
phenopacket_count,
26+
errors
1527
}
1628
}
1729

18-
pub fn phenopacket_count(&self) -> usize {
30+
pub fn phenopacket_count(cohort_qc_list: &Vec<CohortQc>) -> usize {
1931
let mut c = 0 as usize;
20-
for cohort in &self.cohort_qc_list {
32+
for cohort in cohort_qc_list {
2133
c += cohort.ppkt_count();
2234
}
2335
return c;
2436
}
2537

26-
pub fn check_moi(&self) {
27-
println!("Checking agreement of MOI and allele counts");
28-
let mut misfit = 0 as usize;
29-
for cohort in &self.cohort_qc_list {
30-
misfit += cohort.check_moi();
38+
fn get_errors(cohort_qc_list: Vec<CohortQc>) -> Vec<QcReport> {
39+
let mut errs: Vec<QcReport> = Vec::new();
40+
for cohort in cohort_qc_list {
41+
let cohort_errs = cohort.get_errors();
42+
errs.extend(cohort_errs);
3143
}
32-
println!("Found misalignment in {} cohorts", misfit);
44+
45+
errs
3346
}
3447

48+
49+
3550
}

0 commit comments

Comments
 (0)