Skip to content

Commit 036b6dc

Browse files
committed
Genome build identifier patch is optional. Test parsing assembly report logic.
1 parent 5e1c38d commit 036b6dc

File tree

4 files changed

+162
-14
lines changed

4 files changed

+162
-14
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Assembly name: GRCm39
2+
# Description: Genome Reference Consortium Mouse Build 39
3+
# Organism name: Mus musculus (house mouse)
4+
# Infraspecific name: strain=C57BL/6J
5+
# Taxid: 10090
6+
# BioProject: PRJNA20689
7+
# Submitter: Genome Reference Consortium
8+
# Date: 2020-06-24
9+
# Assembly type: haploid
10+
# Release type: major
11+
# Assembly level: Chromosome
12+
# Genome representation: full
13+
# RefSeq category: Reference Genome
14+
# GenBank assembly accession: GCA_000001635.9
15+
# RefSeq assembly accession: GCF_000001635.27
16+
# RefSeq assembly and GenBank assemblies identical: yes
17+
#
18+
## Assembly-Units:
19+
## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name
20+
## GCA_000000055.3 GCF_000000055.20 Primary Assembly (C57BL/6J)
21+
## GCA_000004175.1 GCF_000004175.1 non-nuclear
22+
#
23+
# Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by
24+
# unlocalized scaffolds.
25+
# Unplaced scaffolds are listed at the end.
26+
# RefSeq is equal or derived from GenBank object.
27+
#
28+
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
29+
1 assembled-molecule 1 Chromosome CM000994.3 = NC_000067.7 C57BL/6J 195154279 na
30+
2 assembled-molecule 2 Chromosome CM000995.3 = NC_000068.8 C57BL/6J 181755017 na
31+
3 assembled-molecule 3 Chromosome CM000996.3 = NC_000069.7 C57BL/6J 159745316 na
32+
4 assembled-molecule 4 Chromosome CM000997.3 = NC_000070.7 C57BL/6J 156860686 na
33+
5 assembled-molecule 5 Chromosome CM000998.3 = NC_000071.7 C57BL/6J 151758149 na
34+
6 assembled-molecule 6 Chromosome CM000999.3 = NC_000072.7 C57BL/6J 149588044 na
35+
7 assembled-molecule 7 Chromosome CM001000.3 = NC_000073.7 C57BL/6J 144995196 na
36+
8 assembled-molecule 8 Chromosome CM001001.3 = NC_000074.7 C57BL/6J 130127694 na
37+
9 assembled-molecule 9 Chromosome CM001002.3 = NC_000075.7 C57BL/6J 124359700 na
38+
10 assembled-molecule 10 Chromosome CM001003.3 = NC_000076.7 C57BL/6J 130530862 na
39+
11 assembled-molecule 11 Chromosome CM001004.3 = NC_000077.7 C57BL/6J 121973369 na
40+
12 assembled-molecule 12 Chromosome CM001005.3 = NC_000078.7 C57BL/6J 120092757 na
41+
13 assembled-molecule 13 Chromosome CM001006.3 = NC_000079.7 C57BL/6J 120883175 na
42+
14 assembled-molecule 14 Chromosome CM001007.3 = NC_000080.7 C57BL/6J 125139656 na
43+
15 assembled-molecule 15 Chromosome CM001008.3 = NC_000081.7 C57BL/6J 104073951 na
44+
16 assembled-molecule 16 Chromosome CM001009.3 = NC_000082.7 C57BL/6J 98008968 na
45+
17 assembled-molecule 17 Chromosome CM001010.3 = NC_000083.7 C57BL/6J 95294699 na
46+
18 assembled-molecule 18 Chromosome CM001011.3 = NC_000084.7 C57BL/6J 90720763 na
47+
19 assembled-molecule 19 Chromosome CM001012.3 = NC_000085.7 C57BL/6J 61420004 na
48+
X assembled-molecule X Chromosome CM001013.3 = NC_000086.8 C57BL/6J 169476592 na
49+
Y assembled-molecule Y Chromosome CM001014.3 = NC_000087.8 C57BL/6J 91455967 na
50+
MMCHR1_RANDOM_CTG1 unlocalized-scaffold 1 Chromosome GL456210.1 = NT_166280.1 C57BL/6J 169725 na
51+
MMCHR1_RANDOM_CTG2 unlocalized-scaffold 1 Chromosome GL456211.1 = NT_166281.1 C57BL/6J 241735 na
52+
MMCHR1_RANDOM_CTG3 unlocalized-scaffold 1 Chromosome GL456212.1 = NT_166282.1 C57BL/6J 153618 na
53+
MMCHR1_RANDOM_CTG5 unlocalized-scaffold 1 Chromosome GL456221.1 = NT_162750.1 C57BL/6J 206961 na
54+
MMCHR1_RANDOM_CTG7 unlocalized-scaffold 1 Chromosome GL456239.1 = NT_166338.1 C57BL/6J 40056 na
55+
MMCHR1_RANDOM_CTG6 unlocalized-scaffold 1 Chromosome MU069434.1 = NW_023337852.1 C57BL/6J 8412 na
56+
MMCHR4UN_CTG6 unlocalized-scaffold 4 Chromosome JH584295.1 = NT_187055.1 C57BL/6J 1976 na
57+
MMCHR5_RANDOM_CTG4 unlocalized-scaffold 5 Chromosome GL456354.1 = NT_166438.1 C57BL/6J 195993 na
58+
MMCHR5_RANDOM_CTG1 unlocalized-scaffold 5 Chromosome JH584296.1 = NT_187056.1 C57BL/6J 199368 na
59+
MMCHR5_RANDOM_CTG2 unlocalized-scaffold 5 Chromosome JH584297.1 = NT_187057.1 C57BL/6J 205776 na
60+
MMCHR5_RANDOM_CTG3 unlocalized-scaffold 5 Chromosome JH584298.1 = NT_187058.1 C57BL/6J 184189 na
61+
MMCHR5_RANDOM_CTG5 unlocalized-scaffold 5 Chromosome JH584299.1 = NT_187059.1 C57BL/6J 953012 na
62+
MMCHR7_RANDOM_CTG1 unlocalized-scaffold 7 Chromosome GL456219.1 = NT_166307.1 C57BL/6J 175968 na
63+
MMCHRX_RANDOM_CTG2 unlocalized-scaffold X Chromosome GL456233.2 = NT_165789.3 C57BL/6J 559103 na
64+
MMCHRY_CTGU1 unlocalized-scaffold Y Chromosome JH584300.1 = NT_187060.1 C57BL/6J 182347 na
65+
MMCHRY_CTGU2 unlocalized-scaffold Y Chromosome JH584301.1 = NT_187061.1 C57BL/6J 259875 na
66+
MMCHRY_CTGU3 unlocalized-scaffold Y Chromosome JH584302.1 = NT_187062.1 C57BL/6J 155838 na
67+
MMCHRY_CTGU4 unlocalized-scaffold Y Chromosome JH584303.1 = NT_187063.1 C57BL/6J 158099 na
68+
MSCHRUN_CTG13 unplaced-scaffold na na GL456359.1 = NT_166443.1 C57BL/6J 22974 na
69+
MSCHRUN_CTG14 unplaced-scaffold na na GL456360.1 = NT_166444.1 C57BL/6J 31704 na
70+
MSCHRUN_CTG21 unplaced-scaffold na na GL456366.1 = NT_166450.1 C57BL/6J 47073 na
71+
MSCHRUN_CTG2 unplaced-scaffold na na GL456367.1 = NT_166451.1 C57BL/6J 42057 na
72+
MSCHRUN_CTG22 unplaced-scaffold na na GL456368.1 = NT_166452.1 C57BL/6J 20208 na
73+
MSCHRUN_CTG19 unplaced-scaffold na na GL456370.1 = NT_166454.1 C57BL/6J 26764 na
74+
MSCHRUN_CTG16 unplaced-scaffold na na GL456372.1 = NT_166456.1 C57BL/6J 28664 na
75+
MSCHRUN_CTG3 unplaced-scaffold na na GL456378.1 = NT_166462.1 C57BL/6J 31602 na
76+
MSCHRUN_CTG20 unplaced-scaffold na na GL456379.1 = NT_166463.1 C57BL/6J 72385 na
77+
MSCHRUN_CTG4 unplaced-scaffold na na GL456381.1 = NT_166465.1 C57BL/6J 25871 na
78+
MSCHRUN_CTG5 unplaced-scaffold na na GL456382.1 = NT_166466.1 C57BL/6J 23158 na
79+
MSCHRUN_CTG6 unplaced-scaffold na na GL456383.1 = NT_166467.1 C57BL/6J 38659 na
80+
MSCHRUN_CTG7 unplaced-scaffold na na GL456385.1 = NT_166469.1 C57BL/6J 35240 na
81+
MSCHRUN_CTG17 unplaced-scaffold na na GL456387.1 = NT_166471.1 C57BL/6J 24685 na
82+
MSCHRUN_CTG18 unplaced-scaffold na na GL456389.1 = NT_166473.1 C57BL/6J 28772 na
83+
MSCHRUN_CTG9 unplaced-scaffold na na GL456390.1 = NT_166474.1 C57BL/6J 24668 na
84+
MSCHRUN_CTG10 unplaced-scaffold na na GL456392.1 = NT_166476.1 C57BL/6J 23629 na
85+
MSCHRUN_CTG12 unplaced-scaffold na na GL456394.1 = NT_166478.1 C57BL/6J 24323 na
86+
MSCHRUN_CTG15 unplaced-scaffold na na GL456396.1 = NT_166480.1 C57BL/6J 21240 na
87+
MSCHRUN_CTG23 unplaced-scaffold na na JH584304.1 = NT_187064.1 C57BL/6J 114452 na
88+
MSCHRUN_CTG24 unplaced-scaffold na na MU069435.1 = NW_023337853.1 C57BL/6J 31129 na
89+
MT assembled-molecule MT Mitochondrion AY172335.1 = NC_005089.1 non-nuclear 16299 chrM

src/builds.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ pub fn get_grch37_p13<C>() -> GenomeBuild<C>
2222
where
2323
C: FromStr + Zero + PartialOrd,
2424
{
25-
let id = GenomeBuildIdentifier::new("GRCh37".into(), "p13".into());
25+
let id = GenomeBuildIdentifier::from(("GRCh37", "p13"));
2626
parse_assembly_report(id, GRCh37_p13).expect("Reading builtin GRCh37.p13 assembly report")
2727
}
2828

@@ -31,7 +31,7 @@ pub fn get_grch38_p13<C>() -> GenomeBuild<C>
3131
where
3232
C: FromStr + Zero + PartialOrd,
3333
{
34-
let id = GenomeBuildIdentifier::new("GRCh38".into(), "p13".into());
34+
let id = GenomeBuildIdentifier::from(("GRCh38", "p13"));
3535
parse_assembly_report(id, GRCh38_p13).expect("Reading builtin GRCh38.p13 assembly report")
3636
}
3737

@@ -91,15 +91,21 @@ where
9191
// Accessions:
9292
// GenBank, column #4
9393
if let Some(&gen_bank) = fields.get(4) {
94-
alt_names.push(gen_bank);
94+
if gen_bank != "na" {
95+
alt_names.push(gen_bank);
96+
}
9597
};
9698
// RefSeq, column #6
9799
if let Some(&refseq) = fields.get(6) {
98-
alt_names.push(refseq);
100+
if refseq != "na" {
101+
alt_names.push(refseq);
102+
}
99103
};
100104
// UCSC, column #9
101105
if let Some(&ucsc) = fields.get(9) {
102-
alt_names.push(ucsc);
106+
if ucsc != "na" {
107+
alt_names.push(ucsc);
108+
}
103109
};
104110

105111
// Length

src/genome.rs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* Contig
77
* ***************************************************************************************************************** */
88

9+
use std::str::FromStr;
10+
911
use num_traits::Zero;
1012

1113
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
@@ -57,22 +59,41 @@ where
5759
/// Identifier of a genome build: a major assembly name plus a patch label.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
5860
pub struct GenomeBuildIdentifier {
5961
/// Major assembly name; this commit uses values such as "GRCh37", "GRCh38" and "GRCm39".
major_assembly: String,
60-
patch: String,
62+
/// Patch label such as "p13"; `None` when the identifier was created without a patch.
patch: Option<String>,
6163
}
6264

63-
impl GenomeBuildIdentifier {
64-
pub fn new(major_assembly: String, patch: String) -> Self {
65+
/// Create identifier from a string.
66+
///
67+
/// Infallible: the whole input becomes the major assembly name and no patch is set.
68+
impl FromStr for GenomeBuildIdentifier {
69+
// Never actually produced: `from_str` below always returns `Ok`.
type Err = String;
70+
71+
fn from_str(s: &str) -> Result<Self, Self::Err> {
72+
Ok(GenomeBuildIdentifier {
73+
// The input is copied verbatim; it is not split or validated.
major_assembly: s.to_string(),
74+
// A bare string carries no patch information.
patch: None,
75+
})
76+
}
77+
}
78+
79+
/// Create identifier from a `(major_assembly, patch)` pair.
///
/// Both elements are converted with `ToString`; the resulting patch is always `Some`.
impl<T> From<(T, T)> for GenomeBuildIdentifier
80+
where
81+
T: ToString,
82+
{
83+
fn from(value: (T, T)) -> Self {
6584
GenomeBuildIdentifier {
66-
major_assembly,
67-
patch,
85+
// First tuple element is the major assembly name.
major_assembly: value.0.to_string(),
86+
// Second tuple element is the patch label.
patch: Some(value.1.to_string()),
6887
}
6988
}
89+
}
7090

91+
// Read-only accessors for the identifier's parts.
impl GenomeBuildIdentifier {
7192
/// Borrow the major assembly name.
pub fn major_assembly(&self) -> &str {
7293
&self.major_assembly
7394
}
74-
pub fn patch(&self) -> &str {
75-
&self.patch
95+
/// Borrow the patch label, or `None` if the identifier has no patch.
pub fn patch(&self) -> Option<&str> {
96+
// `as_deref` turns `&Option<String>` into `Option<&str>` without cloning.
self.patch.as_deref()
7697
}
7798
}
7899

tests/test_builds.rs

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
use dabuild::builds::*;
1+
use std::{error::Error, fs::File, io::BufReader, str::FromStr};
2+
3+
use dabuild::{builds::*, GenomeBuild, GenomeBuildIdentifier};
24

35
#[test]
46
fn grch38_p13() {
57
let build = get_grch38_p13::<usize>();
68

79
assert_eq!(build.id().major_assembly(), "GRCh38");
8-
assert_eq!(build.id().patch(), "p13");
10+
assert_eq!(build.id().patch(), Some("p13"));
911
assert_eq!(build.contigs().len(), 640);
1012

1113
let contig = build.contig_by_name("chr1");
@@ -22,3 +24,33 @@ fn grch38_p13() {
2224

2325
assert_eq!(contig.length(), &248_956_422usize);
2426
}
27+
28+
// Parses the GRCm39 assembly report fixture added in this commit and checks the
// build identifier, the number of contigs, and the details of contig "Y".
#[test]
29+
fn test_parse_assembly_report() -> Result<(), Box<dyn Error>> {
30+
let path = "data/GCF_000001635.27_GRCm39_assembly_report.txt";
31+
let read = BufReader::new(File::open(path)?);
32+
// An identifier built from a bare string has no patch (checked below).
let build = parse_assembly_report(GenomeBuildIdentifier::from_str("GRCm39").unwrap(), read);
33+
34+
assert!(build.is_ok());
35+
let build: GenomeBuild<u32> = build?;
36+
37+
assert_eq!(build.id().major_assembly(), "GRCm39");
38+
assert_eq!(build.id().patch(), None);
39+
// The fixture lists 61 sequence rows after its `#` header lines.
assert_eq!(build.contigs().len(), 61);
40+
41+
let contig = build.contig_by_name("Y");
42+
assert!(contig.is_some());
43+
44+
let contig = contig.unwrap();
45+
assert_eq!(contig.name(), "Y");
46+
47+
// Only the GenBank and RefSeq accessions are expected: the UCSC-style name of "Y"
// is "na" in the fixture, and "na" values are no longer added as alternative names.
let alt: Vec<_> = contig.alt_names().collect();
48+
assert_eq!(alt.len(), 2);
49+
assert!(["CM001014.3", "NC_000087.8"]
50+
.iter()
51+
.all(|x| alt.contains(x)));
52+
53+
// Matches the Sequence-Length column of the "Y" row in the fixture.
assert_eq!(contig.length(), &91_455_967u32);
54+
55+
Ok(())
56+
}

0 commit comments

Comments
 (0)