diff --git a/README.md b/README.md index e103560a..37ebebaa 100644 --- a/README.md +++ b/README.md @@ -36,21 +36,9 @@ From the root directory, run: Or to skip retrieve and manubot stages, which will speed things up substantially: `snakemake --config stages="skip-refs"` -### Update TRGT genotyping catalogs +### Making/updating genotyper catalogs -``` -python scripts/make-catalog.py -g hg38 -f TRGT data/STRchive-loci.json data/STRchive-disease-loci.hg38.TRGT.bed -python scripts/make-catalog.py -g T2T -f TRGT data/STRchive-loci.json data/STRchive-disease-loci.T2T-chm13.TRGT.bed -python scripts/make-catalog.py -g hg19 -f TRGT data/STRchive-loci.json data/STRchive-disease-loci.hg19.TRGT.bed -``` - -### Update extended BED files - -``` -python scripts/make-catalog.py -f bed -g hg38 data/STRchive-loci.json data/STRchive-disease-loci.hg38.bed -python scripts/make-catalog.py -f bed -g T2T data/STRchive-loci.json data/STRchive-disease-loci.T2T-chm13.bed -python scripts/make-catalog.py -f bed -g hg19 data/STRchive-loci.json data/STRchive-disease-loci.hg19.bed -``` +See `workflow/Snakefile` for example commands ### Install dependencies @@ -67,5 +55,22 @@ conda env update --file scripts/environment.yml --prune conda activate strchive ``` - Note: biomaRt isn't playing nicely with conda, so installing it within the R script where it is used. + + +## Using STRchive catalogs + +### LongTR + +A sample command using [LongTR](https://github.com/gymrek-lab/LongTR) to genotype the STRchive catalog in Oxford Nanopore data. The alignment parameters were suggested in https://github.com/gymrek-lab/LongTR/issues/21. The genotyping accuracy has not been assessed. 
+ +``` +module load gcc # or otherwise satisfy this dependency +LongTR \ + --max-tr-len 10000 \ + --alignment-params -1.0,-0.458675,-1.0,-0.458675,-0.00005800168,-1,-1 \ + --fasta human_GRCh38_no_alt_analysis_set.fasta \ + --regions STRchive-disease-loci.hg38.longTR.bed \ + --bams sample.bam \ + --tr-vcf sample.longTR.vcf.gz # --max-tr-len 10000: largest locus in STRchive is currently ~4000 bp +``` diff --git a/data/STRchive-loci.json b/data/STRchive-loci.json index 5f04ca8e..553a260b 100644 --- a/data/STRchive-loci.json +++ b/data/STRchive-loci.json @@ -19,8 +19,7 @@ "unknown_motif_gene_orientation": [], "disease": "Oculopharyngodistal myopathy type 5", "gene": "ABCD3", - "flank_motif": null, - "locus_structure": "(GCC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -83,8 +82,7 @@ "unknown_motif_gene_orientation": [], "disease": "Fragile X syndrome, FRAXE type", "gene": "AFF2", - "flank_motif": null, - "locus_structure": "(GCC)*", + "locus_structure": [], "inheritance": ["XR"], "type": "5' UTR", "location_in_gene": null, @@ -147,8 +145,7 @@ "unknown_motif_gene_orientation": [], "disease": "Intellectual disability associated with fragile site FRA2A", "gene": "AFF3", - "flank_motif": null, - "locus_structure": "(GCC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 3", @@ -211,8 +208,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinal and bulbar muscular atrophy, Kennedy Disease", "gene": "AR", - "flank_motif": null, - "locus_structure": "(GCA)*", + "locus_structure": [], "inheritance": ["XR"], "type": "Coding", "location_in_gene": "Exon 1", @@ -275,8 +271,7 @@ "unknown_motif_gene_orientation": [], "disease": "Early-infantile epileptic encephalopathy", "gene": "ARX", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["XR"], "type": "Coding", "location_in_gene": "Exon 2, aa 110-115", @@ -339,8 +334,7 @@ "unknown_motif_gene_orientation": 
[], "disease": "Partington syndrome", "gene": "ARX", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["XR"], "type": "Coding", "location_in_gene": "Exon 2, aa 144-155", @@ -403,8 +397,7 @@ "unknown_motif_gene_orientation": [], "disease": "Dentatorubral-Pallidoluysian Atrophy", "gene": "ATN1", - "flank_motif": null, - "locus_structure": "(CAG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 5", @@ -467,8 +460,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 1", "gene": "ATXN1", - "flank_motif": null, - "locus_structure": "(CTG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 8", @@ -531,8 +523,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 10", "gene": "ATXN10", - "flank_motif": null, - "locus_structure": "(ATTCT)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 9", @@ -595,8 +586,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 2", "gene": "ATXN2", - "flank_motif": null, - "locus_structure": "(CTG)*", + "locus_structure": [], "inheritance": ["AD", "AR"], "type": "Coding", "location_in_gene": "Exon 1", @@ -659,8 +649,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 3/Machado-Joseph disease", "gene": "ATXN3", - "flank_motif": null, - "locus_structure": "(CTG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 10", @@ -723,8 +712,17 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 7", "gene": "ATXN7", - "flank_motif": "(CAG)n(CCG)4", - "locus_structure": "(CAG)*(CCG)+", + "locus_structure": [ + { + "motif": "CAG", + "count": null, + "type": "pathogenic_repeat" + }, + { + "motif": "CCG", + "count": 4, + "type": "flank_repeat" + }], "inheritance": ["AD"], "type": "Coding", 
"location_in_gene": "Exon 1, 2, or 3 (depending on isoform)", @@ -787,8 +785,17 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 8", "gene": "ATXN8OS", - "flank_motif": "(CTA)10(CTG)n", - "locus_structure": "(CTA)*(CTG)*", + "locus_structure": [ + { + "motif": "CTA", + "count": 10, + "type": "flank_repeat" + }, + { + "motif": "CTG", + "count": null, + "type": "pathogenic_repeat" + }], "inheritance": ["AD"], "type": "Coding/3' UTR", "location_in_gene": "Exon 1 or 3' UTR depending on transcript", @@ -851,8 +858,7 @@ "unknown_motif_gene_orientation": ["AAAAA", "AAAAC", "AAATG", "AAAAG", "AAGAT", "AAACT", "AACAT", "AATAC", "AAATC", "AATGC"], "disease": "Spinocerebellar ataxia type 31", "gene": "BEAN1", - "flank_motif": null, - "locus_structure": "(TGGAA)*(TAGAA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 4/4", @@ -915,8 +921,7 @@ "unknown_motif_gene_orientation": [], "disease": "Frontotemporal dementia (FTD) and/or amyotrophic lateral sclerosis (ALS)", "gene": "C9orf72", - "flank_motif": null, - "locus_structure": "(GGCCCC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR/Intronic", "location_in_gene": "Intron 1 or 5' UTR depending on transcript", @@ -979,8 +984,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 6", "gene": "CACNA1A", - "flank_motif": null, - "locus_structure": "(CTG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Last Exon: 47 or 48", @@ -1043,8 +1047,7 @@ "unknown_motif_gene_orientation": [], "disease": "Jacobsen syndrome (FRAX11B fragile site)", "gene": "CBL", - "flank_motif": null, - "locus_structure": "(CGG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -1107,8 +1110,22 @@ "unknown_motif_gene_orientation": ["CTGT"], "disease": "Myotonic dystrophy type 2", "gene": "CNBP", - "flank_motif": "(CAGG)n(CAGA)10(CA)19", - 
"locus_structure": "(CAGG)*(CAGA)*(CA)*", + "locus_structure": [ + { + "motif": "CAGG", + "count": null, + "type": "pathogenic_repeat" + }, + { + "motif": "CAGA", + "count": 10, + "type": "flank_repeat" + }, + { + "motif": "CA", + "count": 19, + "type": "flank_repeat" + }], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -1171,8 +1188,7 @@ "unknown_motif_gene_orientation": [], "disease": "Multiple epiphyseal dysplasia, Pseudoachondroplasia", "gene": "COMP", - "flank_motif": null, - "locus_structure": "(GTC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 13", @@ -1235,8 +1251,7 @@ "unknown_motif_gene_orientation": [], "disease": "Progressive Myoclonic Epilepsy Type 1 (EPM1) Unverricht-Lundborg Disease (ULD)", "gene": "CSTB", - "flank_motif": null, - "locus_structure": "(CGCGGGGCGGGG)*", + "locus_structure": [], "inheritance": ["AR"], "type": "5' UTR", "location_in_gene": null, @@ -1299,8 +1314,17 @@ "unknown_motif_gene_orientation": ["TTTTT"], "disease": "Spinocerebellar ataxia type 37", "gene": "DAB1", - "flank_motif": null, - "locus_structure": "(AAAAT)*(GAAAT)*(AAAAT)*", + "locus_structure": [ + { + "motif": "AAAAT", + "count": 7, + "type": "internal_repeat" + }, + { + "motif": "GAAAT", + "count": null, + "type": "pathogenic_repeat" + }], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1 (most isoforms)", @@ -1363,8 +1387,7 @@ "unknown_motif_gene_orientation": [], "disease": "Intellectual developmental disorder, FRA12A type", "gene": "DIP2B", - "flank_motif": null, - "locus_structure": "(GGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -1427,8 +1450,17 @@ "unknown_motif_gene_orientation": [], "disease": "Duchenne muscular dystrophy", "gene": "DMD", - "flank_motif": "(TTC)n(T)8", - "locus_structure": "(TTC)*(T)*", + "locus_structure": [ + { + "motif": "TTC", + "count": null, + "type": "pathogenic_repeat" + }, + 
{ + "motif": "T", + "count": 8, + "type": "flank_repeat" + }], "inheritance": ["XR"], "type": "Intronic", "location_in_gene": "Intron 62", @@ -1491,8 +1523,7 @@ "unknown_motif_gene_orientation": [], "disease": "Myotonic dystrophy type 1", "gene": "DMPK", - "flank_motif": null, - "locus_structure": "(CAG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "3' UTR", "location_in_gene": null, @@ -1551,12 +1582,11 @@ "pathogenic_motif_gene_orientation": ["ACAGCGAGGTCGGCAGCGGC"], "benign_motif_reference_orientation": [], "benign_motif_gene_orientation": [], - "unknown_motif_reference_orientation": [], - "unknown_motif_gene_orientation": [], + "unknown_motif_reference_orientation": ["CCTCGCTGCGCCGCTGCCGA"], + "unknown_motif_gene_orientation": ["AGCGAGGTCGGCAGCGGCGC"], "disease": "Richieri-Costa-Pereira syndrome", "gene": "EIF4A3", - "flank_motif": null, - "locus_structure": "(CCTCGCTGCGCCGCTGCCGA)*(CCTCGCTGTGCCGCTGCCGA)*", + "locus_structure": [], "inheritance": ["AR"], "type": "5' UTR", "location_in_gene": null, @@ -1619,8 +1649,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia 27B", "gene": "FGF14", - "flank_motif": null, - "locus_structure": "(GAA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -1683,8 +1712,7 @@ "unknown_motif_gene_orientation": [], "disease": "Fragile X syndrome (FXS), fragile X-associated tremor/ataxia syndrome (FXTAS), and fragile X-associated primary ovarian insufficiency FXPOI/POF1", "gene": "FMR1", - "flank_motif": null, - "locus_structure": "(CGG)*", + "locus_structure": [], "inheritance": ["XD"], "type": "5' UTR", "location_in_gene": null, @@ -1747,8 +1775,7 @@ "unknown_motif_gene_orientation": [], "disease": "Blepharophimosis, epicanthus inversus, and ptosis", "gene": "FOXL2", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["AD", "AR"], "type": "Coding", "location_in_gene": "Exon 1", @@ -1811,8 
+1838,17 @@ "unknown_motif_gene_orientation": [], "disease": "Friedreich ataxia", "gene": "FXN", - "flank_motif": "(A)16(GAA)n", - "locus_structure": "(A)*(GAA)*", + "locus_structure": [ + { + "motif": "A", + "count": 16, + "type": "flank_repeat" + }, + { + "motif": "GAA", + "count": null, + "type": "pathogenic_repeat" + }], "inheritance": ["AR"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -1875,8 +1911,7 @@ "unknown_motif_gene_orientation": [], "disease": "Oculopharyngodistal myopathy type 2", "gene": "GIPC1", - "flank_motif": null, - "locus_structure": "(CCG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -1939,8 +1974,7 @@ "unknown_motif_gene_orientation": [], "disease": "Glutaminase deficiency", "gene": "GLS", - "flank_motif": null, - "locus_structure": "(GCA)*", + "locus_structure": [], "inheritance": ["AR"], "type": "5' UTR", "location_in_gene": null, @@ -2003,8 +2037,7 @@ "unknown_motif_gene_orientation": [], "disease": "Hand-foot-genital syndrome 1", "gene": "HOXA13", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1", @@ -2067,8 +2100,7 @@ "unknown_motif_gene_orientation": [], "disease": "Hand-foot-genital syndrome 2", "gene": "HOXA13", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1", @@ -2131,8 +2163,7 @@ "unknown_motif_gene_orientation": [], "disease": "Hand-foot-genital syndrome 3", "gene": "HOXA13", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1", @@ -2195,8 +2226,7 @@ "unknown_motif_gene_orientation": [], "disease": "Syndactyly", "gene": "HOXD13", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1", 
@@ -2259,8 +2289,22 @@ "unknown_motif_gene_orientation": [], "disease": "Huntington disease", "gene": "HTT", - "flank_motif": "(CAG)nCAACAG(CCG)12", - "locus_structure": "(CAG)*CAACAG(CCG)*", + "locus_structure": [ + { + "motif": "CAG", + "count": null, + "type": "pathogenic_repeat" + }, + { + "motif": "CAACAG", + "count": 1, + "type": "interruption" + }, + { + "motif": "CCG", + "count": 12, + "type": "flank_repeat" + }], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1", @@ -2323,8 +2367,7 @@ "unknown_motif_gene_orientation": [], "disease": "Huntington disease-like 2", "gene": "JPH3", - "flank_motif": null, - "locus_structure": "(CTG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 2", @@ -2387,8 +2430,7 @@ "unknown_motif_gene_orientation": [], "disease": "Oculopharyngodistal myopathy type 1", "gene": "LRP12", - "flank_motif": null, - "locus_structure": "(CGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -2451,8 +2493,7 @@ "unknown_motif_gene_orientation": ["ATGTT", "AGTTT", "GTTTT", "TTTTT"], "disease": "Familial adult myoclonic epilepsy type 3", "gene": "MARCHF6", - "flank_motif": null, - "locus_structure": "(TTTTA)*(TTTCA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -2515,8 +2556,7 @@ "unknown_motif_gene_orientation": [], "disease": "Nongoitrous congenital hypothyroidism-3", "gene": "MIR7-2", - "flank_motif": null, - "locus_structure": "(TTTG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Non-coding", "location_in_gene": null, @@ -2579,8 +2619,7 @@ "unknown_motif_gene_orientation": [], "disease": "Autosomal dominant tubulointerstitial kidney disease", "gene": "MUC1", - "flank_motif": null, - "locus_structure": "(GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 
2", @@ -2643,8 +2682,7 @@ "unknown_motif_gene_orientation": [], "disease": "NAXE-related mitochondrial encephalopathy", "gene": "NAXE", - "flank_motif": null, - "locus_structure": "(GGGCC)*", + "locus_structure": [], "inheritance": ["AR"], "type": "5' UTR", "location_in_gene": null, @@ -2707,8 +2745,7 @@ "unknown_motif_gene_orientation": [], "disease": "Amyotrophic lateral sclerosis", "gene": "NIPA1", - "flank_motif": null, - "locus_structure": "(GCG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1/Intron 1 depending on transcript", @@ -2771,8 +2808,17 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 36", "gene": "NOP56", - "flank_motif": "(GGCCTG)n(CGCCTG)3", - "locus_structure": "(GGCCTG)*(CGCCTG)*", + "locus_structure": [ + { + "motif": "GGCCTG", + "count": null, + "type": "pathogenic_repeat" + }, + { + "motif": "CGCCTG", + "count": 3, + "type": "flank_repeat" + }], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -2835,8 +2881,7 @@ "unknown_motif_gene_orientation": [], "disease": "Neuronal intranuclear inclusion disease, Alzheimer disease and parkinsonism phenotype, Oculopharyngodistal myopathy (OPDM) type 3", "gene": "NOTCH2NLC", - "flank_motif": null, - "locus_structure": "(GGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -2899,8 +2944,7 @@ "unknown_motif_gene_orientation": [], "disease": "Oculopharyngeal myopathy with leukoencephalopathy 1", "gene": "NUTM2B-AS1", - "flank_motif": null, - "locus_structure": "(GGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "lncRNA", "location_in_gene": "Exon 1 (noncoding)", @@ -2963,8 +3007,7 @@ "unknown_motif_gene_orientation": [], "disease": "Oculopharyngeal muscular dystrophy", "gene": "PABPN1", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["AD", "AR"], "type": "Coding", "location_in_gene": "Exon 
1", @@ -3027,8 +3070,7 @@ "unknown_motif_gene_orientation": [], "disease": "Congenital central hypoventilation syndrome", "gene": "PHOX2B", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 3", @@ -3091,8 +3133,7 @@ "unknown_motif_gene_orientation": [], "disease": "Myopathy with Rimmed Ubiquitin-Positive Autophagic Vacuolation, PLIN4-Related Myopathy", "gene": "PLIN4", - "flank_motif": null, - "locus_structure": "(TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 3", @@ -3155,8 +3196,22 @@ "unknown_motif_gene_orientation": [], "disease": "Progressive external ophthalmoplegia, Parkinson's disease", "gene": "POLG", - "flank_motif": null, - "locus_structure": "(GCT)*GTT(GCT)*", + "locus_structure": [ + { + "motif": "GCT", + "count": 2, + "type": "flank_repeat" + }, + { + "motif": "GTT", + "count": 1, + "type": "interruption" + }, + { + "motif": "GCT", + "count": null, + "type": "pathogenic_repeat" + }], "inheritance": [], "type": "Coding", "location_in_gene": "Exon 2", @@ -3219,8 +3274,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 12", "gene": "PPP2R2B", - "flank_motif": null, - "locus_structure": "(GCT)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -3283,8 +3337,7 @@ "unknown_motif_gene_orientation": [], "disease": "Hereditary sensory and autonomic neuropathy type VIII", "gene": "PRDM12", - "flank_motif": null, - "locus_structure": "(GCC)*", + "locus_structure": [], "inheritance": ["AR"], "type": "Coding", "location_in_gene": "Exon 5", @@ -3347,8 +3400,17 @@ "unknown_motif_gene_orientation": [], "disease": "Creutzfeldt-Jakob disease", "gene": "PRNP", - "flank_motif": "(CCTCAGGGCGGTGGTGGCTGGGGGCAG)1(CCTCATGGTGGTGGCTGGGGGCAG)n", - 
"locus_structure": "(CCTCAGGGCGGTGGTGGCTGGGGGCAG)*(CCTCATGGTGGTGGCTGGGGGCAG)*", + "locus_structure": [ + { + "motif": "CCTCAGGGCGGTGGTGGCTGGGGGCAG", + "count": 1, + "type": "flank_repeat" + }, + { + "motif": "CCTCATGGTGGTGGCTGGGGGCAG", + "count": null, + "type": "pathogenic_repeat" + }], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 2", @@ -3411,8 +3473,7 @@ "unknown_motif_gene_orientation": ["GGGGT", "ATGGG"], "disease": "Familial adult myoclonic epilepsy type 8", "gene": "RAI1", - "flank_motif": null, - "locus_structure": "(ATTTT)*(TTTCA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 4", @@ -3475,8 +3536,7 @@ "unknown_motif_gene_orientation": ["TTTTT", "ATGTT"], "disease": "Familial adult myoclonic epilepsy type 7", "gene": "RAPGEF2", - "flank_motif": null, - "locus_structure": "(TTTTA)*(TTTCA)*(TTTTA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 14", @@ -3539,8 +3599,7 @@ "unknown_motif_gene_orientation": ["TTTTT", "GTTTT", "CCGTT", "CTTGT", "ACCTT", "CTGTT", "CCCCT", "CGTTT", "CCCGT", "ACCTC", "CTTTTT", "CCTTTT", "CCCTTT"], "disease": "Cerebellar ataxia, neuropathy, and vestibular areflexia syndrome", "gene": "RFC1", - "flank_motif": null, - "locus_structure": "(AAGGG)*(ACAGG)*", + "locus_structure": [], "inheritance": ["AR"], "type": "Intronic", "location_in_gene": "Intron 2", @@ -3603,8 +3662,7 @@ "unknown_motif_gene_orientation": [], "disease": "Oculopharyngodistal myopathy type 4", "gene": "RILPL1", - "flank_motif": null, - "locus_structure": "(GGC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "5' UTR", "location_in_gene": null, @@ -3667,8 +3725,7 @@ "unknown_motif_gene_orientation": [], "disease": "Cleidocranial dysplasia", "gene": "RUNX2", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 3", @@ -3731,8 +3788,7 @@ 
"unknown_motif_gene_orientation": ["TTTTT", "AGTTT", "ATGTT", "ATTGT", "AGTGT"], "disease": "Familial adult myoclonic epilepsy type 1", "gene": "SAMD12", - "flank_motif": null, - "locus_structure": "(TAAAA)*(TGAAA)*(TAAAA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 4/4", @@ -3795,8 +3851,7 @@ "unknown_motif_gene_orientation": [], "disease": "X-linked panhypopituitarism ; X-linked mental retardation with isolated growth hormone", "gene": "SOX3", - "flank_motif": null, - "locus_structure": "(NGC)*", + "locus_structure": [], "inheritance": ["XR"], "type": "Coding", "location_in_gene": "Exon 1", @@ -3859,8 +3914,7 @@ "unknown_motif_gene_orientation": ["TTTTT", "GTTTT", "GGTTT", "CGTTT", "AGTTT", "AGTTG", "AGTTC", "ATTGT", "ATTCT", "ATGTT"], "disease": "Familial adult myoclonic epilepsy 2", "gene": "STARD7", - "flank_motif": null, - "locus_structure": "(AAATG)*(AAAAT)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -3923,8 +3977,7 @@ "unknown_motif_gene_orientation": [], "disease": "X-linked dystonia-parkinsonism (XDP) a.k.a. 
Dystonia 3, torsion, X-linked (DYT3)", "gene": "TAF1", - "flank_motif": null, - "locus_structure": "(AGAGGG)*", + "locus_structure": [], "inheritance": ["XR"], "type": "Intronic", "location_in_gene": "Intron 32", @@ -3987,8 +4040,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia type 17", "gene": "TBP", - "flank_motif": null, - "locus_structure": "(GCA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 3", @@ -4051,8 +4103,7 @@ "unknown_motif_gene_orientation": [], "disease": "Tetralogy of Fallot", "gene": "TBX1", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 9", @@ -4115,8 +4166,7 @@ "unknown_motif_gene_orientation": [], "disease": "Fuchs endothelial corneal dystrophy 3", "gene": "TCF4", - "flank_motif": null, - "locus_structure": "(CAG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -4179,8 +4229,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia", "gene": "THAP11", - "flank_motif": null, - "locus_structure": "(CAG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 1", @@ -4243,8 +4292,7 @@ "unknown_motif_gene_orientation": ["TTTTT"], "disease": "Familial adult myoclonic epilepsy type 6", "gene": "TNRC6A", - "flank_motif": null, - "locus_structure": "(TTTTA)*(TTTCA)*(TTTTA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Exon 1", @@ -4307,8 +4355,7 @@ "unknown_motif_gene_orientation": [], "disease": "Congenital Progressive Universal Melanosis", "gene": "TYMS", - "flank_motif": null, - "locus_structure": "(GATGGT)*", + "locus_structure": [], "inheritance": ["AR"], "type": "Non-coding", "location_in_gene": "Intron 3", @@ -4371,8 +4418,7 @@ "unknown_motif_gene_orientation": [], "disease": "Neuronopathy, distal hereditary 
motor, autosomal recessive 7", "gene": "VWA1", - "flank_motif": null, - "locus_structure": "(GGCGCGGAGC)*", + "locus_structure": [], "inheritance": ["AR"], "type": "Coding", "location_in_gene": "Exon 1", @@ -4435,8 +4481,22 @@ "unknown_motif_gene_orientation": [], "disease": "Baratela-Scott Syndrome/Desbuquois dysplasia 2", "gene": "XYLT1", - "flank_motif": null, - "locus_structure": "(GCC)*", + "locus_structure": [ + { + "motif": "GCC", + "count": 7, + "type": "internal_repeat" + }, + { + "motif": "TCGGCTCGCCGCTGCTCCTCCTCC", + "count": 1, + "type": "interruption" + }, + { + "motif": "GCC", + "count": null, + "type": "pathogenic_repeat" + }], "inheritance": ["AR"], "type": "5' UTR", "location_in_gene": "Intron 1", @@ -4499,8 +4559,7 @@ "unknown_motif_gene_orientation": ["TTTTT", "ATGTT"], "disease": "Familial adult myoclonic epilepsy 4", "gene": "YEATS2", - "flank_motif": null, - "locus_structure": "(TTTTA)*(TTTCA)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", @@ -4563,8 +4622,7 @@ "unknown_motif_gene_orientation": [], "disease": "Spinocerebellar ataxia 4", "gene": "ZFHX3", - "flank_motif": null, - "locus_structure": "(GCC)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Last Exon (exact exon transcript dependent)", @@ -4627,8 +4685,7 @@ "unknown_motif_gene_orientation": [], "disease": "Holoprosencephaly-5", "gene": "ZIC2", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Coding", "location_in_gene": "Exon 3", @@ -4691,8 +4748,7 @@ "unknown_motif_gene_orientation": [], "disease": "X-linked VACTERL syndrome", "gene": "ZIC3", - "flank_motif": null, - "locus_structure": "(GCN)*", + "locus_structure": [], "inheritance": ["XR"], "type": "Coding", "location_in_gene": "Exon 1", @@ -4755,8 +4811,7 @@ "unknown_motif_gene_orientation": [], "disease": "Autism spectrum disorder associated with fragile site FRA7A", 
"gene": "ZNF713", - "flank_motif": null, - "locus_structure": "(GCG)*", + "locus_structure": [], "inheritance": ["AD"], "type": "Intronic", "location_in_gene": "Intron 1", diff --git a/data/STRchive-loci.schema.json b/data/STRchive-loci.schema.json index cdc6a287..d1b593d0 100644 --- a/data/STRchive-loci.schema.json +++ b/data/STRchive-loci.schema.json @@ -109,14 +109,43 @@ "description": "Gene symbol e.g. RFC1", "type": [ "string", "null" ] }, - "flank_motif": { - "description": "Structure of the locus with any flanking motifs. The pathogenic locus should be indicated with n copies.", - "examples": [ "(CAG)nCAACAG(CCG)12" ], - "type": [ "string", "null" ] - }, "locus_structure": { - "description": "Structure of the locus including the order of motifs and any interruptions e.g. (CAG)*CAACAG(CCG)*", - "type": [ "string", "null" ] + "description": "Structure of the locus including the order of motifs and any interruptions. Only one motif can have a null count.", + "type": "array", + "items": { + "type": "object", + "properties": { + "motif": { + "type": [ "string", "null" ], + "pattern": "^[ACGTN]+$" + }, + "count": { + "type": [ "integer", "null" ], + "minimum": 0 + }, + "type": { + "anyOf": [ + { + "const": "pathogenic_repeat", + "title": "The pathogenic_repeat will typically have a count of null." + }, + { + "const": "flank_repeat", + "title": "Flanking repeat. Must be outside the locus coordinates. The order in the json will be used to determine the location. For example, if it occurs before the pathogenic repeat, it will be considered 5' flanking. These are expected to be present in the reference genome." + }, + { + "const": "internal_repeat", + "title": "A repeat that is located within the locus coordinates. It may occur in only some alleles and may be absent from the reference genome." + }, + { + "const": "interruption", + "title": "A non-repetitive interruption in the repeat structure, e.g. an insertion that doesn't match the surrounding motif. 
Must be within the locus coordinates." + } + ] + } + }, + "required": [ "motif", "count", "type" ] + } }, "inheritance": { "description": "Inheritance pattern(s) of the disease e.g. AD for autosomal dominant", @@ -415,7 +444,6 @@ "unknown_motif_gene_orientation", "disease", "gene", - "flank_motif", "locus_structure", "inheritance", "location_in_gene", diff --git a/data/catalogs/README.md b/data/catalogs/README.md new file mode 100644 index 00000000..3009b5ec --- /dev/null +++ b/data/catalogs/README.md @@ -0,0 +1,45 @@ +# STRchive Catalogs + +This directory contains various genotyping and annotation catalogs based on STRchive tandem repeat loci. + +**CAVEATS:** +- Some of these files are still in active development and should be used with care. The specific coordinates and motifs chosen can affect genotyping accuracy. +- Information about the overall pathogenicity of a locus and about specific ranges and motifs is provided as our best estimate. All information should be verified. + +## Reference Genomes + +- hg38 is the "default" reference from which the others are derived +- hg19 +- CHM13-T2T + +## Genotypers and File Descriptions + +File format: +`STRchive-disease-loci.[reference genome].[software].[file extension(s) e.g. bed, json, bed.gz]` + +### TRGT +- `STRchive-disease-loci.hg19.TRGT.bed` + +### STRanger +- `STRchive-disease-loci.hg38.stranger.json` + +This file is designed to work with the [wf-human-variation workflow](https://github.com/epi2me-labs/wf-human-variation/tree/master). It is modeled after this file: [variant_catalog_hg38.json](https://github.com/epi2me-labs/wf-human-variation/blob/master/data/variant_catalog_hg38.json). It should be used with a matching Straglr BED file where the IDs and start coordinates match. + +**WARNING:** + +STRanger requires values for "NormalMax" and "PathologicMin". For some loci these values may be missing from STRchive because they could not be verified from the literature. 
In cases where one of these values is missing it will be inferred as such: +PathologicMin = NormalMax + 1 +NormalMax = PathologicMin - 1 + +If both values are missing from STRchive the locus will not be included in this file (e.g. where pathogenicity is caused by motif change, not allele size). + +### Atarva + +- `STRchive-disease-loci.hg38.atarva.bed.gz` +- `STRchive-disease-loci.hg38.atarva.bed.gz.tbi` + +Additionally this file is included as the source and to track changes, but is not used by atarva: +- `STRchive-disease-loci.hg38.atarva.bed` + +### Expansion Hunter +- `STRchive-disease-loci.hg19.expansionhunter.json` diff --git a/data/STRchive-disease-loci.T2T-chm13.TRGT.bed b/data/catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed similarity index 80% rename from data/STRchive-disease-loci.T2T-chm13.TRGT.bed rename to data/catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed index 18112b04..9a801c1d 100644 --- a/data/STRchive-disease-loci.T2T-chm13.TRGT.bed +++ b/data/catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed @@ -8,15 +8,15 @@ chr2 96703674 96703732 ID=FAME2_STARD7;MOTIFS=AAATG,AAAAT;STRUC= chr2 100563685 100563738 ID=FRA2A_AFF3;MOTIFS=GCC;STRUC= chr2 176581179 176581224 ID=SD5_HOXD13;MOTIFS=GCN;STRUC= chr2 191369982 191370024 ID=GDPAG_GLS;MOTIFS=GCA;STRUC= -chr3 63956302 63956345 ID=SCA7_ATXN7;MOTIFS=CAG,CCG;STRUC= -chr3 131917482 131917635 ID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= +chr3 63956302 63956333 ID=SCA7_ATXN7;MOTIFS=CAG,CCG;STRUC= +chr3 131917482 131917557 ID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= chr3 141687011 141687054 ID=BPES_FOXL2;MOTIFS=NGC;STRUC= -chr3 186521667 186521706 ID=FAME4_YEATS2;MOTIFS=TTTTA,TTTCA;STRUC= -chr4 3073603 3073723 ID=HD_HTT;MOTIFS=CAG,CCG;STRUC= +chr3 186521667 186521706 ID=FAME4_YEATS2;MOTIFS=TTTCA,TTTTA;STRUC= +chr4 3073603 3073687 ID=HD_HTT;MOTIFS=CAG,CCG;STRUC= chr4 39318077 39318136 ID=CANVAS_RFC1;MOTIFS=AAGGG,ACAGG,AGGGC,AAGGC,AGAGG,AAAAG,AAAGG,AAGAG,AAAGGG;STRUC= chr4 41719745 41719805 
ID=CCHS_PHOX2B;MOTIFS=GCN;STRUC= -chr4 162693303 162693405 ID=FAME7_RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC= -chr5 10295525 10295593 ID=FAME3_MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC= +chr4 162693303 162693405 ID=FAME7_RAPGEF2;MOTIFS=TTTCA,TTTTA;STRUC= +chr5 10295525 10295593 ID=FAME3_MARCHF6;MOTIFS=TTTCA,TTTTA;STRUC= chr5 147414733 147414780 ID=SCA12_PPP2R2B;MOTIFS=GCT;STRUC= chr6 16200188 16200282 ID=SCA1_ATXN1;MOTIFS=CTG;STRUC= chr6 45257567 45257618 ID=CCD_RUNX2;MOTIFS=GCN;STRUC= @@ -26,9 +26,9 @@ chr7 27335813 27335849 ID=HFG_HOXA13-II;MOTIFS=NGC;STRUC= chr7 27335912 27335954 ID=HFG_HOXA13-I;MOTIFS=NGC;STRUC= chr7 56047900 56047939 ID=FRA7A_ZNF713;MOTIFS=GCG;STRUC= chr8 105716409 105716441 ID=OPDM1_LRP12;MOTIFS=CGC;STRUC= -chr8 119495247 119495353 ID=FAME1_SAMD12;MOTIFS=TAAAA,TGAAA;STRUC= +chr8 119495247 119495353 ID=FAME1_SAMD12;MOTIFS=TGAAA,TAAAA;STRUC= chr9 27584063 27584155 ID=FTDALS1_C9orf72;MOTIFS=GGCCCC;STRUC= -chr9 81210818 81210861 ID=FRDA_FXN;MOTIFS=A,GAA;STRUC= +chr9 81210834 81210861 ID=FRDA_FXN;MOTIFS=A,GAA;STRUC= chr9 142886568 142886595 ID=HSAN-VIII_PRDM12;MOTIFS=GCC;STRUC= chr10 80695718 80695748 ID=OPML1_NUTM2B-AS1;MOTIFS=GGC;STRUC= chr11 119226662 119226696 ID=JBS_CBL;MOTIFS=CGG;STRUC= @@ -36,7 +36,7 @@ chr12 6947903 6947941 ID=DRPLA_ATN1;MOTIFS=CAG;STRUC= chr12 50468095 50468118 ID=FRA12A_DIP2B;MOTIFS=GGC;STRUC= chr12 111575873 111575940 ID=SCA2_ATXN2;MOTIFS=CTG;STRUC= chr12 123532573 123532603 ID=OPDM4_RILPL1;MOTIFS=GGC;STRUC= -chr13 69361213 69361270 ID=SCA8_ATXN8OS;MOTIFS=CTA,CTG;STRUC= +chr13 69361243 69361270 ID=SCA8_ATXN8OS;MOTIFS=CTA,CTG;STRUC= chr13 99196358 99196404 ID=HPE5_ZIC2;MOTIFS=GCN;STRUC= chr13 101377549 101377792 ID=SCA27B_FGF14;MOTIFS=GAA,GAAGGA,GAAGAAGAAGAAGCA,AAGGAG;STRUC= chr14 17522488 17522519 ID=OPMD_PABPN1;MOTIFS=GCN;STRUC= @@ -45,13 +45,13 @@ chr15 20458510 20458536 ID=ALS1_NIPA1;MOTIFS=GCG;STRUC= chr15 86324038 86324057 ID=MIR7-2_CHNG3;MOTIFS=TTTG;STRUC= chr15 87088411 87088452 ID=CPEO_POLG;MOTIFS=GCT;STRUC= chr16 17477909 
17478002 ID=DBQD2_XYLT1;MOTIFS=GCC;STRUC= -chr16 24890366 24890430 ID=FAME6_TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC= +chr16 24890366 24890430 ID=FAME6_TNRC6A;MOTIFS=TTTCA,TTTTA;STRUC= chr16 72284666 72284761 ID=SCA31_BEAN1;MOTIFS=TGGAA,TAGAA,AATAA;STRUC= chr16 73638636 73638724 ID=SCA_THAP11;MOTIFS=CAG;STRUC= chr16 78605502 78605569 ID=SCA4_ZFHX3;MOTIFS=GCC;STRUC= chr16 93675723 93675776 ID=HDL2_JPH3;MOTIFS=CTG;STRUC= -chr17 17754961 17755053 ID=FAME8_RAI1;MOTIFS=ATTTT,TTTCA,TTTTA;STRUC= -chr17 81047404 81047534 ID=RCPS_EIF4A3;MOTIFS=CCTCGCTGCGCCGCTGCCGA,CCTCGCTGTGCCGCTGCCGA;STRUC= +chr17 17754961 17755053 ID=FAME8_RAI1;MOTIFS=TTTCA,TTTTA;STRUC= +chr17 81047404 81047534 ID=RCPS_EIF4A3;MOTIFS=CCTCGCTGTGCCGCTGCCGA;STRUC= chr18 821235 821905 ID=CPUM_TYMS;MOTIFS=GATGGT;STRUC= chr18 55789233 55789288 ID=FECD3_TCF4;MOTIFS=CAG;STRUC= chr19 4494212 4497342 ID=MRUPAV_PLIN4;MOTIFS=TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC;STRUC= @@ -59,14 +59,14 @@ chr19 13333136 13333176 ID=SCA6_CACNA1A;MOTIFS=CTG;STRUC= chr19 14622655 14622692 ID=OPDM2_GIPC1;MOTIFS=CCG;STRUC= chr19 18921630 18921645 ID=EDM1-PSACH_COMP;MOTIFS=GTC;STRUC= chr19 48597739 48597756 ID=DM1_DMPK;MOTIFS=CAG;STRUC= -chr20 2683189 2683248 ID=SCA36_NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC= -chr20 4738606 4738705 ID=CJD_PRNP;MOTIFS=CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT;STRUC= +chr20 2683189 2683230 ID=SCA36_NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC= +chr20 4738633 4738705 ID=CJD_PRNP;MOTIFS=CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT;STRUC= chr21 42132054 42132091 ID=EPM1_CSTB;MOTIFS=CGCGGGGCGGGG;STRUC= chr22 20143615 20143660 ID=TOF_TBX1;MOTIFS=GCN;STRUC= chr22 46280059 46280134 ID=SCA10_ATXN10;MOTIFS=ATTCT;STRUC= chrX 24597766 24597802 ID=PRTS_ARX;MOTIFS=NGC;STRUC= chrX 24597886 24597934 ID=EIEE1_ARX;MOTIFS=NGC;STRUC= -chrX 30882677 30882751 ID=DMD_DMD;MOTIFS=TTC,T;STRUC= +chrX 30882677 30882743 
ID=DMD_DMD;MOTIFS=TTC,T;STRUC= chrX 65975147 65975250 ID=SBMA_AR;MOTIFS=GCA;STRUC= chrX 69887153 69887230 ID=XDP_TAF1;MOTIFS=AGAGGG;STRUC= chrX 135876774 135876804 ID=VACTERLX_ZIC3;MOTIFS=GCN;STRUC= diff --git a/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed b/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed new file mode 100644 index 00000000..22aa5ab1 --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed @@ -0,0 +1,88 @@ +#chrom start stop motif motif_len id +chr1 870158 870178 GGCGCGGAGC 10 HMNR7_VWA1 +chr1 57245900 57245935 AAAAT 5 SCA37_DAB1_flank +chr1 57245935 57245973 GAAAT 5 SCA37_DAB1 +chr1 94266544 94266567 GCC 3 OPDM5_ABCD3 +chr1 148519695 148519738 GGC 3 NIID_NOTCH2NLC +chr1 154328121 154330802 GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA 61 ADTKD_MUC1 +chr1 155728131 155728159 GGGCC 5 NME_NAXE +chr2 96703674 96703732 AAATG 5 FAME2_STARD7 +chr2 100563685 100563738 GCC 3 FRA2A_AFF3 +chr2 176581179 176581224 GCN 3 SD5_HOXD13 +chr2 191369982 191370024 GCA 3 GDPAG_GLS +chr3 63956302 63956333 CAG 3 SCA7_ATXN7 +chr3 63956333 63956345 CCG 3 SCA7_ATXN7_flank +chr3 131917482 131917557 CAGG 4 DM2_CNBP +chr3 131917557 131917597 CAGA 4 DM2_CNBP_flank +chr3 131917597 131917635 CA 2 DM2_CNBP_flank +chr3 141687011 141687054 NGC 3 BPES_FOXL2 +chr3 186521667 186521706 TTTCA 5 FAME4_YEATS2 +chr4 3073603 3073687 CAG 3 HD_HTT +chr4 3073693 3073729 CCG 3 HD_HTT_flank +chr4 39318077 39318136 AAGGG 5 CANVAS_RFC1 +chr4 41719745 41719805 GCN 3 CCHS_PHOX2B +chr4 162693303 162693405 TTTCA 5 FAME7_RAPGEF2 +chr5 10295525 10295593 TTTCA 5 FAME3_MARCHF6 +chr5 147414733 147414780 GCT 3 SCA12_PPP2R2B +chr6 16200188 16200282 CTG 3 SCA1_ATXN1 +chr6 45257567 45257618 GCN 3 CCD_RUNX2 +chr6 171935458 171935569 GCA 3 SCA17_TBP +chr7 27335684 27335720 NGC 3 HFG_HOXA13-III +chr7 27335813 27335849 NGC 3 HFG_HOXA13-II +chr7 27335912 27335954 NGC 3 HFG_HOXA13-I +chr7 56047900 56047939 GCG 3 FRA7A_ZNF713 +chr8 105716409 105716441 CGC 3 
OPDM1_LRP12 +chr8 119495247 119495353 TGAAA 5 FAME1_SAMD12 +chr9 27584063 27584155 GGCCCC 6 FTDALS1_C9orf72 +chr9 81210818 81210834 A 1 FRDA_FXN_flank +chr9 81210834 81210861 GAA 3 FRDA_FXN +chr9 142886568 142886595 GCC 3 HSAN-VIII_PRDM12 +chr10 80695718 80695748 GGC 3 OPML1_NUTM2B-AS1 +chr11 119226662 119226696 CGG 3 JBS_CBL +chr12 6947903 6947941 CAG 3 DRPLA_ATN1 +chr12 50468095 50468118 GGC 3 FRA12A_DIP2B +chr12 111575873 111575940 CTG 3 SCA2_ATXN2 +chr12 123532573 123532603 GGC 3 OPDM4_RILPL1 +chr13 69361213 69361243 CTA 3 SCA8_ATXN8OS_flank +chr13 69361243 69361270 CTG 3 SCA8_ATXN8OS +chr13 99196358 99196404 GCN 3 HPE5_ZIC2 +chr13 101377549 101377792 GAA 3 SCA27B_FGF14 +chr14 17522488 17522519 GCN 3 OPMD_PABPN1 +chr14 86300519 86300603 CTG 3 SCA3_ATXN3 +chr15 20458510 20458536 GCG 3 ALS1_NIPA1 +chr15 86324038 86324057 TTTG 4 MIR7-2_CHNG3 +chr15 87088402 87088408 GCT 3 CPEO_POLG_flank +chr15 87088411 87088452 GCT 3 CPEO_POLG +chr16 17477864 17477885 GCC 3 DBQD2_XYLT1_flank +chr16 17477909 17478002 GCC 3 DBQD2_XYLT1 +chr16 24890366 24890430 TTTCA 5 FAME6_TNRC6A +chr16 72284666 72284761 TGGAA 5 SCA31_BEAN1 +chr16 73638636 73638724 CAG 3 SCA_THAP11 +chr16 78605502 78605569 GCC 3 SCA4_ZFHX3 +chr16 93675723 93675776 CTG 3 HDL2_JPH3 +chr17 17754961 17755053 TTTCA 5 FAME8_RAI1 +chr17 81047404 81047534 CCTCGCTGTGCCGCTGCCGA 20 RCPS_EIF4A3 +chr18 821235 821905 GATGGT 6 CPUM_TYMS +chr18 55789233 55789288 CAG 3 FECD3_TCF4 +chr19 4494212 4497342 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC 99 MRUPAV_PLIN4 +chr19 13333136 13333176 CTG 3 SCA6_CACNA1A +chr19 14622655 14622692 CCG 3 OPDM2_GIPC1 +chr19 18921630 18921645 GTC 3 EDM1-PSACH_COMP +chr19 48597739 48597756 CAG 3 DM1_DMPK +chr20 2683189 2683230 GGCCTG 6 SCA36_NOP56 +chr20 2683230 2683248 CGCCTG 6 SCA36_NOP56_flank +chr20 4738606 4738633 CCTCAGGGCGGTGGTGGCTGGGGGCAG 27 CJD_PRNP_flank +chr20 4738633 4738705 CCTCATGGTGGTGGCTGGGGGCAG 24 CJD_PRNP +chr21 42132054 42132091 
CGCGGGGCGGGG 12 EPM1_CSTB +chr22 20143615 20143660 GCN 3 TOF_TBX1 +chr22 46280059 46280134 ATTCT 5 SCA10_ATXN10 +chrX 24597766 24597802 NGC 3 PRTS_ARX +chrX 24597886 24597934 NGC 3 EIEE1_ARX +chrX 30882677 30882743 TTC 3 DMD_DMD +chrX 30882743 30882751 T 1 DMD_DMD_flank +chrX 65975147 65975250 GCA 3 SBMA_AR +chrX 69887153 69887230 AGAGGG 6 XDP_TAF1 +chrX 135876774 135876804 GCN 3 VACTERLX_ZIC3 +chrX 138816203 138816248 NGC 3 XLMR_SOX3 +chrX 146176677 146176769 CGG 3 FXS_FMR1 +chrX 146765190 146765342 GCC 3 FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz b/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz new file mode 100644 index 00000000..3975c33c Binary files /dev/null and b/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz differ diff --git a/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz.tbi b/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz.tbi new file mode 100644 index 00000000..39d95f48 Binary files /dev/null and b/data/catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz.tbi differ diff --git a/data/STRchive-disease-loci.T2T-chm13.bed b/data/catalogs/STRchive-disease-loci.T2T-chm13.general.bed similarity index 100% rename from data/STRchive-disease-loci.T2T-chm13.bed rename to data/catalogs/STRchive-disease-loci.T2T-chm13.general.bed diff --git a/data/catalogs/STRchive-disease-loci.T2T-chm13.longTR.bed b/data/catalogs/STRchive-disease-loci.T2T-chm13.longTR.bed new file mode 100644 index 00000000..48ea05cf --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.T2T-chm13.longTR.bed @@ -0,0 +1,75 @@ +chr1 870159 870178 GGCGCGGAGC HMNR7_VWA1 +chr1 57245936 57245973 GAAAT,AAAAT SCA37_DAB1 +chr1 94266545 94266567 GCC OPDM5_ABCD3 +chr1 148519696 148519738 GGC NIID_NOTCH2NLC +chr1 154328122 154330802 GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA,GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCA,GGCTNNGGGNGCGGTGGAGCCCGGGGCNGGNCTGNTNTCCGGGGCCGAGGTGACANCNTG 
ADTKD_MUC1 +chr1 155728132 155728159 GGGCC NME_NAXE +chr2 96703675 96703732 AAATG,AAAAT FAME2_STARD7 +chr2 100563686 100563738 GCC FRA2A_AFF3 +chr2 176581180 176581224 GCN SD5_HOXD13 +chr2 191369983 191370024 GCA GDPAG_GLS +chr3 63956303 63956333 CAG SCA7_ATXN7 +chr3 131917483 131917557 CAGG DM2_CNBP +chr3 141687012 141687054 NGC BPES_FOXL2 +chr3 186521668 186521706 TTTCA,TTTTA FAME4_YEATS2 +chr4 3073604 3073687 CAG HD_HTT +chr4 39318078 39318136 AAGGG,ACAGG,AGGGC,AAGGC,AGAGG,AAAAG,AAAGG,AAGAG,AAAGGG CANVAS_RFC1 +chr4 41719746 41719805 GCN CCHS_PHOX2B +chr4 162693304 162693405 TTTCA,TTTTA FAME7_RAPGEF2 +chr5 10295526 10295593 TTTCA,TTTTA FAME3_MARCHF6 +chr5 147414734 147414780 GCT SCA12_PPP2R2B +chr6 16200189 16200282 CTG SCA1_ATXN1 +chr6 45257568 45257618 GCN CCD_RUNX2 +chr6 171935459 171935569 GCA SCA17_TBP +chr7 27335685 27335720 NGC HFG_HOXA13-III +chr7 27335814 27335849 NGC HFG_HOXA13-II +chr7 27335913 27335954 NGC HFG_HOXA13-I +chr7 56047901 56047939 GCG FRA7A_ZNF713 +chr8 105716410 105716441 CGC OPDM1_LRP12 +chr8 119495248 119495353 TGAAA,TAAAA FAME1_SAMD12 +chr9 27584064 27584155 GGCCCC FTDALS1_C9orf72 +chr9 81210835 81210861 GAA FRDA_FXN +chr9 142886569 142886595 GCC HSAN-VIII_PRDM12 +chr10 80695719 80695748 GGC OPML1_NUTM2B-AS1 +chr11 119226663 119226696 CGG JBS_CBL +chr12 6947904 6947941 CAG DRPLA_ATN1 +chr12 50468096 50468118 GGC FRA12A_DIP2B +chr12 111575874 111575940 CTG SCA2_ATXN2 +chr12 123532574 123532603 GGC OPDM4_RILPL1 +chr13 69361244 69361270 CTG SCA8_ATXN8OS +chr13 99196359 99196404 GCN HPE5_ZIC2 +chr13 101377550 101377792 GAA,GAAGGA,GAAGAAGAAGAAGCA,AAGGAG SCA27B_FGF14 +chr14 17522489 17522519 GCN OPMD_PABPN1 +chr14 86300520 86300603 CTG SCA3_ATXN3 +chr15 20458511 20458536 GCG ALS1_NIPA1 +chr15 86324039 86324057 TTTG MIR7-2_CHNG3 +chr15 87088412 87088452 GCT CPEO_POLG +chr16 17477910 17478002 GCC DBQD2_XYLT1 +chr16 24890367 24890430 TTTCA,TTTTA FAME6_TNRC6A +chr16 72284667 72284761 TGGAA,TAGAA,AATAA SCA31_BEAN1 +chr16 73638637 73638724 CAG 
SCA_THAP11 +chr16 78605503 78605569 GCC SCA4_ZFHX3 +chr16 93675724 93675776 CTG HDL2_JPH3 +chr17 17754962 17755053 TTTCA,TTTTA FAME8_RAI1 +chr17 81047405 81047534 CCTCGCTGTGCCGCTGCCGA RCPS_EIF4A3 +chr18 821236 821905 GATGGT CPUM_TYMS +chr18 55789234 55789288 CAG FECD3_TCF4 +chr19 4494213 4497342 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC MRUPAV_PLIN4 +chr19 13333137 13333176 CTG SCA6_CACNA1A +chr19 14622656 14622692 CCG OPDM2_GIPC1 +chr19 18921631 18921645 GTC EDM1-PSACH_COMP +chr19 48597740 48597756 CAG DM1_DMPK +chr20 2683190 2683230 GGCCTG SCA36_NOP56 +chr20 4738634 4738705 CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT CJD_PRNP +chr21 42132055 42132091 CGCGGGGCGGGG EPM1_CSTB +chr22 20143616 20143660 GCN TOF_TBX1 +chr22 46280060 46280134 ATTCT SCA10_ATXN10 +chrX 24597767 24597802 NGC PRTS_ARX +chrX 24597887 24597934 NGC EIEE1_ARX +chrX 30882678 30882743 TTC DMD_DMD +chrX 65975148 65975250 GCA SBMA_AR +chrX 69887154 69887230 AGAGGG XDP_TAF1 +chrX 135876775 135876804 GCN VACTERLX_ZIC3 +chrX 138816204 138816248 NGC XLMR_SOX3 +chrX 146176678 146176769 CGG FXS_FMR1 +chrX 146765191 146765342 GCC FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.T2T-chm13.straglr.bed b/data/catalogs/STRchive-disease-loci.T2T-chm13.straglr.bed new file mode 100644 index 00000000..d99575b2 --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.T2T-chm13.straglr.bed @@ -0,0 +1,86 @@ +chr1 870158 870178 GGCGCGGAGC HMNR7_VWA1 HMNR7_VWA1 +chr1 57245900 57245935 AAAAT SCA37_DAB1 SCA37_DAB1_AAAAT +chr1 57245935 57245973 GAAAT SCA37_DAB1 SCA37_DAB1 +chr1 94266544 94266567 GCC OPDM5_ABCD3 OPDM5_ABCD3 +chr1 148519695 148519738 GGC NIID_NOTCH2NLC NIID_NOTCH2NLC +chr1 155728131 155728159 GGGCC NME_NAXE NME_NAXE +chr2 96703674 96703732 AAATG FAME2_STARD7 FAME2_STARD7 +chr2 100563685 100563738 GCC FRA2A_AFF3 FRA2A_AFF3 +chr2 176581179 176581224 GCN SD5_HOXD13 SD5_HOXD13 +chr2 191369982 191370024 GCA GDPAG_GLS GDPAG_GLS +chr3 
63956302 63956333 CAG SCA7_ATXN7 SCA7_ATXN7 +chr3 63956333 63956345 CCG SCA7_ATXN7 SCA7_ATXN7_CCG +chr3 131917482 131917557 CAGG DM2_CNBP DM2_CNBP +chr3 131917557 131917597 CAGA DM2_CNBP DM2_CNBP_CAGA +chr3 131917597 131917635 CA DM2_CNBP DM2_CNBP_CA +chr3 141687011 141687054 NGC BPES_FOXL2 BPES_FOXL2 +chr3 186521667 186521706 TTTCA FAME4_YEATS2 FAME4_YEATS2 +chr4 3073603 3073687 CAG HD_HTT HD_HTT +chr4 3073693 3073729 CCG HD_HTT HD_HTT_CCG +chr4 39318077 39318136 AAGGG CANVAS_RFC1 CANVAS_RFC1 +chr4 41719745 41719805 GCN CCHS_PHOX2B CCHS_PHOX2B +chr4 162693303 162693405 TTTCA FAME7_RAPGEF2 FAME7_RAPGEF2 +chr5 10295525 10295593 TTTCA FAME3_MARCHF6 FAME3_MARCHF6 +chr5 147414733 147414780 GCT SCA12_PPP2R2B SCA12_PPP2R2B +chr6 16200188 16200282 CTG SCA1_ATXN1 SCA1_ATXN1 +chr6 45257567 45257618 GCN CCD_RUNX2 CCD_RUNX2 +chr6 171935458 171935569 GCA SCA17_TBP SCA17_TBP +chr7 27335684 27335720 NGC HFG_HOXA13-III HFG_HOXA13-III +chr7 27335813 27335849 NGC HFG_HOXA13-II HFG_HOXA13-II +chr7 27335912 27335954 NGC HFG_HOXA13-I HFG_HOXA13-I +chr7 56047900 56047939 GCG FRA7A_ZNF713 FRA7A_ZNF713 +chr8 105716409 105716441 CGC OPDM1_LRP12 OPDM1_LRP12 +chr8 119495247 119495353 TGAAA FAME1_SAMD12 FAME1_SAMD12 +chr9 27584063 27584155 GGCCCC FTDALS1_C9orf72 FTDALS1_C9orf72 +chr9 81210818 81210834 A FRDA_FXN FRDA_FXN_A +chr9 81210834 81210861 GAA FRDA_FXN FRDA_FXN +chr9 142886568 142886595 GCC HSAN-VIII_PRDM12 HSAN-VIII_PRDM12 +chr10 80695718 80695748 GGC OPML1_NUTM2B-AS1 OPML1_NUTM2B-AS1 +chr11 119226662 119226696 CGG JBS_CBL JBS_CBL +chr12 6947903 6947941 CAG DRPLA_ATN1 DRPLA_ATN1 +chr12 50468095 50468118 GGC FRA12A_DIP2B FRA12A_DIP2B +chr12 111575873 111575940 CTG SCA2_ATXN2 SCA2_ATXN2 +chr12 123532573 123532603 GGC OPDM4_RILPL1 OPDM4_RILPL1 +chr13 69361213 69361243 CTA SCA8_ATXN8OS SCA8_ATXN8OS_CTA +chr13 69361243 69361270 CTG SCA8_ATXN8OS SCA8_ATXN8OS +chr13 99196358 99196404 GCN HPE5_ZIC2 HPE5_ZIC2 +chr13 101377549 101377792 GAA SCA27B_FGF14 SCA27B_FGF14 +chr14 17522488 17522519 
GCN OPMD_PABPN1 OPMD_PABPN1 +chr14 86300519 86300603 CTG SCA3_ATXN3 SCA3_ATXN3 +chr15 20458510 20458536 GCG ALS1_NIPA1 ALS1_NIPA1 +chr15 86324038 86324057 TTTG MIR7-2_CHNG3 MIR7-2_CHNG3 +chr15 87088402 87088408 GCT CPEO_POLG CPEO_POLG_GCT +chr15 87088411 87088452 GCT CPEO_POLG CPEO_POLG +chr16 17477864 17477885 GCC DBQD2_XYLT1 DBQD2_XYLT1_GCC +chr16 17477909 17478002 GCC DBQD2_XYLT1 DBQD2_XYLT1 +chr16 24890366 24890430 TTTCA FAME6_TNRC6A FAME6_TNRC6A +chr16 72284666 72284761 TGGAA SCA31_BEAN1 SCA31_BEAN1 +chr16 73638636 73638724 CAG SCA_THAP11 SCA_THAP11 +chr16 78605502 78605569 GCC SCA4_ZFHX3 SCA4_ZFHX3 +chr16 93675723 93675776 CTG HDL2_JPH3 HDL2_JPH3 +chr17 17754961 17755053 TTTCA FAME8_RAI1 FAME8_RAI1 +chr17 81047404 81047534 CCTCGCTGTGCCGCTGCCGA RCPS_EIF4A3 RCPS_EIF4A3 +chr18 821235 821905 GATGGT CPUM_TYMS CPUM_TYMS +chr18 55789233 55789288 CAG FECD3_TCF4 FECD3_TCF4 +chr19 4494212 4497342 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC MRUPAV_PLIN4 MRUPAV_PLIN4 +chr19 13333136 13333176 CTG SCA6_CACNA1A SCA6_CACNA1A +chr19 14622655 14622692 CCG OPDM2_GIPC1 OPDM2_GIPC1 +chr19 18921630 18921645 GTC EDM1-PSACH_COMP EDM1-PSACH_COMP +chr19 48597739 48597756 CAG DM1_DMPK DM1_DMPK +chr20 2683189 2683230 GGCCTG SCA36_NOP56 SCA36_NOP56 +chr20 2683230 2683248 CGCCTG SCA36_NOP56 SCA36_NOP56_CGCCTG +chr20 4738606 4738633 CCTCAGGGCGGTGGTGGCTGGGGGCAG CJD_PRNP CJD_PRNP_CCTCAGGGCGGTGGTGGCTGGGGGCAG +chr20 4738633 4738705 CCTCATGGTGGTGGCTGGGGGCAG CJD_PRNP CJD_PRNP +chr21 42132054 42132091 CGCGGGGCGGGG EPM1_CSTB EPM1_CSTB +chr22 20143615 20143660 GCN TOF_TBX1 TOF_TBX1 +chr22 46280059 46280134 ATTCT SCA10_ATXN10 SCA10_ATXN10 +chrX 24597766 24597802 NGC PRTS_ARX PRTS_ARX +chrX 24597886 24597934 NGC EIEE1_ARX EIEE1_ARX +chrX 30882677 30882743 TTC DMD_DMD DMD_DMD +chrX 30882743 30882751 T DMD_DMD DMD_DMD_T +chrX 65975147 65975250 GCA SBMA_AR SBMA_AR +chrX 69887153 69887230 AGAGGG XDP_TAF1 XDP_TAF1 +chrX 135876774 135876804 GCN 
VACTERLX_ZIC3 VACTERLX_ZIC3 +chrX 138816203 138816248 NGC XLMR_SOX3 XLMR_SOX3 +chrX 146176677 146176769 CGG FXS_FMR1 FXS_FMR1 +chrX 146765190 146765342 GCC FRAXE_AFF2 FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.T2T-chm13.stranger.json b/data/catalogs/STRchive-disease-loci.T2T-chm13.stranger.json new file mode 100644 index 00000000..d2961a87 --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.T2T-chm13.stranger.json @@ -0,0 +1,985 @@ +[ +{ + "LocusId": "HMNR7_VWA1", + "ReferenceRegion": "chr1:870158-870178", + "LocusStructure": "(GGCGCGGAGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GGCGCGGAGC", + "Disease": "HMNR7", + "NormalMax": 2.0, + "PathologicMin": 3.0, + "Gene": "VWA1" +}, +{ + "LocusId": "SCA37_DAB1", + "ReferenceRegion": ["chr1:57245900-57245935", "chr1:57245935-57245973"], + "LocusStructure": "(AAAAT)*(GAAAT)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA37_DAB1_AAAAT", "SCA37_DAB1"], + "PathologicRegion": "chr1:57245935-57245973", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GAAAT", + "Disease": "SCA37", + "NormalMax": 30.0, + "PathologicMin": 31.0, + "Gene": "DAB1" +}, +{ + "LocusId": "OPDM5_ABCD3", + "ReferenceRegion": "chr1:94266544-94266567", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "OPDM5", + "NormalMax": 44.0, + "PathologicMin": 118.0, + "Gene": "ABCD3" +}, +{ + "LocusId": "NIID_NOTCH2NLC", + "ReferenceRegion": "chr1:148519695-148519738", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "NIID", + "NormalMax": 37.0, + "PathologicMin": 66.0, + "Gene": "NOTCH2NLC" +}, +{ + "LocusId": "NME_NAXE", + "ReferenceRegion": "chr1:155728131-155728159", + "LocusStructure": "(GGGCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": 
"GGGCC", + "Disease": "NME", + "NormalMax": 7.0, + "PathologicMin": 200.0, + "Gene": "NAXE" +}, +{ + "LocusId": "FAME2_STARD7", + "ReferenceRegion": "chr2:96703674-96703732", + "LocusStructure": "(AAATG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "AAATG", + "Disease": "FAME2", + "NormalMax": 273.0, + "PathologicMin": 274.0, + "Gene": "STARD7" +}, +{ + "LocusId": "FRA2A_AFF3", + "ReferenceRegion": "chr2:100563685-100563738", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "FRA2A", + "NormalMax": 20.0, + "PathologicMin": 300.0, + "Gene": "AFF3" +}, +{ + "LocusId": "SD5_HOXD13", + "ReferenceRegion": "chr2:176581179-176581224", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "SD5", + "NormalMax": 15.0, + "PathologicMin": 22.0, + "Gene": "HOXD13" +}, +{ + "LocusId": "GDPAG_GLS", + "ReferenceRegion": "chr2:191369982-191370024", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCA", + "Disease": "GDPAG", + "NormalMax": 38.0, + "PathologicMin": 680.0, + "Gene": "GLS" +}, +{ + "LocusId": "SCA7_ATXN7", + "ReferenceRegion": ["chr3:63956302-63956333", "chr3:63956333-63956345"], + "LocusStructure": "(CAG)*(CCG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA7_ATXN7", "SCA7_ATXN7_CCG"], + "PathologicRegion": "chr3:63956302-63956333", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "SCA7", + "NormalMax": 27.0, + "PathologicMin": 37.0, + "Gene": "ATXN7" +}, +{ + "LocusId": "DM2_CNBP", + "ReferenceRegion": ["chr3:131917482-131917557", "chr3:131917557-131917597", "chr3:131917597-131917635"], + "LocusStructure": "(CAGG)*(CAGA)*(CA)*", + "VariantType": ["Repeat", "Repeat", "Repeat"], + "VariantId": ["DM2_CNBP", "DM2_CNBP_CAGA", 
"DM2_CNBP_CA"], + "PathologicRegion": "chr3:131917482-131917557", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAGG", + "Disease": "DM2", + "NormalMax": 26.0, + "PathologicMin": 75.0, + "Gene": "CNBP" +}, +{ + "LocusId": "BPES_FOXL2", + "ReferenceRegion": "chr3:141687011-141687054", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "NGC", + "Disease": "BPES", + "NormalMax": 14.0, + "PathologicMin": 15.0, + "Gene": "FOXL2" +}, +{ + "LocusId": "FAME4_YEATS2", + "ReferenceRegion": "chr3:186521667-186521706", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME4", + "NormalMax": 999.0, + "PathologicMin": 1000.0, + "Gene": "YEATS2" +}, +{ + "LocusId": "HD_HTT", + "ReferenceRegion": ["chr4:3073603-3073687", "chr4:3073693-3073729"], + "LocusStructure": "(CAG)*CAACAG(CCG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["HD_HTT", "HD_HTT_CCG"], + "PathologicRegion": "chr4:3073603-3073687", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "HD", + "NormalMax": 26.0, + "PathologicMin": 36.0, + "Gene": "HTT" +}, +{ + "LocusId": "CANVAS_RFC1", + "ReferenceRegion": "chr4:39318077-39318136", + "LocusStructure": "(AAGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "AAGGG", + "Disease": "CANVAS", + "NormalMax": 11.0, + "PathologicMin": 400.0, + "Gene": "RFC1" +}, +{ + "LocusId": "CCHS_PHOX2B", + "ReferenceRegion": "chr4:41719745-41719805", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "CCHS", + "NormalMax": 20.0, + "PathologicMin": 26.0, + "Gene": "PHOX2B" +}, +{ + "LocusId": "FAME7_RAPGEF2", + "ReferenceRegion": "chr4:162693303-162693405", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, 
+ "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME7", + "NormalMax": 59.0, + "PathologicMin": 60.0, + "Gene": "RAPGEF2" +}, +{ + "LocusId": "FAME3_MARCHF6", + "ReferenceRegion": "chr5:10295525-10295593", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME3", + "NormalMax": 790.0, + "PathologicMin": 791.0, + "Gene": "MARCHF6" +}, +{ + "LocusId": "SCA12_PPP2R2B", + "ReferenceRegion": "chr5:147414733-147414780", + "LocusStructure": "(GCT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCT", + "Disease": "SCA12", + "NormalMax": 32.0, + "PathologicMin": 51.0, + "Gene": "PPP2R2B" +}, +{ + "LocusId": "SCA1_ATXN1", + "ReferenceRegion": "chr6:16200188-16200282", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA1", + "NormalMax": 35.0, + "PathologicMin": 39.0, + "Gene": "ATXN1" +}, +{ + "LocusId": "CCD_RUNX2", + "ReferenceRegion": "chr6:45257567-45257618", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "CCD", + "NormalMax": 17.0, + "PathologicMin": 20.0, + "Gene": "RUNX2" +}, +{ + "LocusId": "SCA17_TBP", + "ReferenceRegion": "chr6:171935458-171935569", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCA", + "Disease": "SCA17", + "NormalMax": 40.0, + "PathologicMin": 49.0, + "Gene": "TBP" +}, +{ + "LocusId": "HFG_HOXA13-III", + "ReferenceRegion": "chr7:27335684-27335720", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-III", + "NormalMax": 18.0, + "PathologicMin": 22.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "HFG_HOXA13-II", + "ReferenceRegion": 
"chr7:27335813-27335849", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-II", + "NormalMax": 12.0, + "PathologicMin": 18.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "HFG_HOXA13-I", + "ReferenceRegion": "chr7:27335912-27335954", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-I", + "NormalMax": 14.0, + "PathologicMin": 22.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "FRA7A_ZNF713", + "ReferenceRegion": "chr7:56047900-56047939", + "LocusStructure": "(GCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCG", + "Disease": "FRA7A", + "NormalMax": 22.0, + "PathologicMin": 450.0, + "Gene": "ZNF713" +}, +{ + "LocusId": "OPDM1_LRP12", + "ReferenceRegion": "chr8:105716409-105716441", + "LocusStructure": "(CGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CGC", + "Disease": "OPDM1", + "NormalMax": 45.0, + "PathologicMin": 85.0, + "Gene": "LRP12" +}, +{ + "LocusId": "FAME1_SAMD12", + "ReferenceRegion": "chr8:119495247-119495353", + "LocusStructure": "(TGAAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGAAA", + "Disease": "FAME1", + "NormalMax": 104.0, + "PathologicMin": 105.0, + "Gene": "SAMD12" +}, +{ + "LocusId": "FTDALS1_C9orf72", + "ReferenceRegion": "chr9:27584063-27584155", + "LocusStructure": "(GGCCCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGCCCC", + "Disease": "FTDALS1", + "NormalMax": 23.0, + "PathologicMin": 251.0, + "Gene": "C9orf72" +}, +{ + "LocusId": "FRDA_FXN", + "ReferenceRegion": ["chr9:81210818-81210834", "chr9:81210834-81210861"], + "LocusStructure": "(A)*(GAA)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["FRDA_FXN_A", "FRDA_FXN"], + "PathologicRegion": 
"chr9:81210834-81210861", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GAA", + "Disease": "FRDA", + "NormalMax": 33.0, + "PathologicMin": 56.0, + "Gene": "FXN" +}, +{ + "LocusId": "HSAN-VIII_PRDM12", + "ReferenceRegion": "chr9:142886568-142886595", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCC", + "Disease": "HSAN VIII", + "NormalMax": 14.0, + "PathologicMin": 18.0, + "Gene": "PRDM12" +}, +{ + "LocusId": "OPML1_NUTM2B-AS1", + "ReferenceRegion": "chr10:80695718-80695748", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "OPML1", + "NormalMax": 16.0, + "PathologicMin": 161.0, + "Gene": "NUTM2B-AS1" +}, +{ + "LocusId": "JBS_CBL", + "ReferenceRegion": "chr11:119226662-119226696", + "LocusStructure": "(CGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CGG", + "Disease": "JBS", + "NormalMax": 79.0, + "PathologicMin": 101.0, + "Gene": "CBL" +}, +{ + "LocusId": "DRPLA_ATN1", + "ReferenceRegion": "chr12:6947903-6947941", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "DRPLA", + "NormalMax": 35.0, + "PathologicMin": 48.0, + "Gene": "ATN1" +}, +{ + "LocusId": "FRA12A_DIP2B", + "ReferenceRegion": "chr12:50468095-50468118", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "FRA12A", + "NormalMax": 23.0, + "PathologicMin": 273.0, + "Gene": "DIP2B" +}, +{ + "LocusId": "SCA2_ATXN2", + "ReferenceRegion": "chr12:111575873-111575940", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "CTG", + "Disease": "SCA2", + "NormalMax": 30.0, + "PathologicMin": 35.0, + "Gene": "ATXN2" +}, +{ + "LocusId": 
"OPDM4_RILPL1", + "ReferenceRegion": "chr12:123532573-123532603", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "OPDM4", + "NormalMax": 16.0, + "PathologicMin": 120.0, + "Gene": "RILPL1" +}, +{ + "LocusId": "SCA8_ATXN8OS", + "ReferenceRegion": ["chr13:69361213-69361243", "chr13:69361243-69361270"], + "LocusStructure": "(CTA)*(CTG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA8_ATXN8OS_CTA", "SCA8_ATXN8OS"], + "PathologicRegion": "chr13:69361243-69361270", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA8", + "NormalMax": 50.0, + "PathologicMin": 71.0, + "Gene": "ATXN8OS" +}, +{ + "LocusId": "HPE5_ZIC2", + "ReferenceRegion": "chr13:99196358-99196404", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "HPE5", + "NormalMax": 15.0, + "PathologicMin": 25.0, + "Gene": "ZIC2" +}, +{ + "LocusId": "SCA27B_FGF14", + "ReferenceRegion": "chr13:101377549-101377792", + "LocusStructure": "(GAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GAA", + "Disease": "SCA27B", + "NormalMax": 179.0, + "PathologicMin": 320.0, + "Gene": "FGF14" +}, +{ + "LocusId": "OPMD_PABPN1", + "ReferenceRegion": "chr14:17522488-17522519", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "GCN", + "Disease": "OPMD", + "NormalMax": 10.0, + "PathologicMin": 12.0, + "Gene": "PABPN1" +}, +{ + "LocusId": "SCA3_ATXN3", + "ReferenceRegion": "chr14:86300519-86300603", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA3, MJD", + "NormalMax": 44.0, + "PathologicMin": 60.0, + "Gene": "ATXN3" +}, +{ + "LocusId": "ALS1_NIPA1", + "ReferenceRegion": "chr15:20458510-20458536", 
+ "LocusStructure": "(GCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCG", + "Disease": "ALS1", + "NormalMax": 10.0, + "PathologicMin": 11.0, + "Gene": "NIPA1" +}, +{ + "LocusId": "MIR7-2_CHNG3", + "ReferenceRegion": "chr15:86324038-86324057", + "LocusStructure": "(TTTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTG", + "Disease": "CHNG3", + "NormalMax": 4.0, + "PathologicMin": 5.0, + "Gene": "MIR7-2" +}, +{ + "LocusId": "CPEO_POLG", + "ReferenceRegion": ["chr15:87088402-87088408", "chr15:87088411-87088452"], + "LocusStructure": "(GCT)*GTT(GCT)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["CPEO_POLG_GCT", "CPEO_POLG"], + "PathologicRegion": "chr15:87088411-87088452", + "HGNCId": null, + "InheritanceMode": [], + "DisplayRU": "GCT", + "Disease": "CPEO", + "NormalMax": 10.0, + "PathologicMin": 11.0, + "Gene": "POLG" +}, +{ + "LocusId": "DBQD2_XYLT1", + "ReferenceRegion": ["chr16:17477864-17477885", "chr16:17477909-17478002"], + "LocusStructure": "(GCC)*TCGGCTCGCCGCTGCTCCTCCTCC(GCC)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["DBQD2_XYLT1_GCC", "DBQD2_XYLT1"], + "PathologicRegion": "chr16:17477909-17478002", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCC", + "Disease": "DBQD2, BSS", + "NormalMax": 20.0, + "PathologicMin": 72.0, + "Gene": "XYLT1" +}, +{ + "LocusId": "FAME6_TNRC6A", + "ReferenceRegion": "chr16:24890366-24890430", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME6", + "NormalMax": 1099.0, + "PathologicMin": 1100.0, + "Gene": "TNRC6A" +}, +{ + "LocusId": "SCA31_BEAN1", + "ReferenceRegion": "chr16:72284666-72284761", + "LocusStructure": "(TGGAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGGAA", + "Disease": "SCA31", + "NormalMax": 109.0, + "PathologicMin": 110.0, 
+ "Gene": "BEAN1" +}, +{ + "LocusId": "SCA_THAP11", + "ReferenceRegion": "chr16:73638636-73638724", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "SCA", + "NormalMax": 38.0, + "PathologicMin": 45.0, + "Gene": "THAP11" +}, +{ + "LocusId": "SCA4_ZFHX3", + "ReferenceRegion": "chr16:78605502-78605569", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "SCA4", + "NormalMax": 26.0, + "PathologicMin": 46.0, + "Gene": "ZFHX3" +}, +{ + "LocusId": "HDL2_JPH3", + "ReferenceRegion": "chr16:93675723-93675776", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "HDL2", + "NormalMax": 28.0, + "PathologicMin": 40.0, + "Gene": "JPH3" +}, +{ + "LocusId": "FAME8_RAI1", + "ReferenceRegion": "chr17:17754961-17755053", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME8", + "NormalMax": 8.0, + "PathologicMin": 9.0, + "Gene": "RAI1" +}, +{ + "LocusId": "RCPS_EIF4A3", + "ReferenceRegion": "chr17:81047404-81047534", + "LocusStructure": "(CCTCGCTGTGCCGCTGCCGA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "CCTCGCTGTGCCGCTGCCGA", + "Disease": "RCPS", + "NormalMax": 12.0, + "PathologicMin": 14.0, + "Gene": "EIF4A3" +}, +{ + "LocusId": "CPUM_TYMS", + "ReferenceRegion": "chr18:821235-821905", + "LocusStructure": "(GATGGT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GATGGT", + "Disease": "CPUM", + "NormalMax": 172, + "PathologicMin": 210, + "Gene": "TYMS" +}, +{ + "LocusId": "FECD3_TCF4", + "ReferenceRegion": "chr18:55789233-55789288", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + 
"DisplayRU": "CAG", + "Disease": "FECD3", + "NormalMax": 39.0, + "PathologicMin": 51.0, + "Gene": "TCF4" +}, +{ + "LocusId": "MRUPAV_PLIN4", + "ReferenceRegion": "chr19:4494212-4497342", + "LocusStructure": "(TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC", + "Disease": "MRUPAV", + "NormalMax": 31.0, + "PathologicMin": 37.0, + "Gene": "PLIN4" +}, +{ + "LocusId": "SCA6_CACNA1A", + "ReferenceRegion": "chr19:13333136-13333176", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA6", + "NormalMax": 18.0, + "PathologicMin": 21.0, + "Gene": "CACNA1A" +}, +{ + "LocusId": "OPDM2_GIPC1", + "ReferenceRegion": "chr19:14622655-14622692", + "LocusStructure": "(CCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CCG", + "Disease": "OPDM2", + "NormalMax": 32.0, + "PathologicMin": 73.0, + "Gene": "GIPC1" +}, +{ + "LocusId": "EDM1-PSACH_COMP", + "ReferenceRegion": "chr19:18921630-18921645", + "LocusStructure": "(GTC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GTC", + "Disease": "EDM1, PSACH", + "NormalMax": 5.0, + "PathologicMin": 6.0, + "Gene": "COMP" +}, +{ + "LocusId": "DM1_DMPK", + "ReferenceRegion": "chr19:48597739-48597756", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "DM1", + "NormalMax": 34.0, + "PathologicMin": 50.0, + "Gene": "DMPK" +}, +{ + "LocusId": "SCA36_NOP56", + "ReferenceRegion": ["chr20:2683189-2683230", "chr20:2683230-2683248"], + "LocusStructure": "(GGCCTG)*(CGCCTG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA36_NOP56", 
"SCA36_NOP56_CGCCTG"], + "PathologicRegion": "chr20:2683189-2683230", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGCCTG", + "Disease": "SCA36", + "NormalMax": 14.0, + "PathologicMin": 650.0, + "Gene": "NOP56" +}, +{ + "LocusId": "CJD_PRNP", + "ReferenceRegion": ["chr20:4738606-4738633", "chr20:4738633-4738705"], + "LocusStructure": "(CCTCAGGGCGGTGGTGGCTGGGGGCAG)*(CCTCATGGTGGTGGCTGGGGGCAG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["CJD_PRNP_CCTCAGGGCGGTGGTGGCTGGGGGCAG", "CJD_PRNP"], + "PathologicRegion": "chr20:4738633-4738705", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CCTCATGGTGGTGGCTGGGGGCAG", + "Disease": "CJD", + "NormalMax": 4.0, + "PathologicMin": 5.0, + "Gene": "PRNP" +}, +{ + "LocusId": "EPM1_CSTB", + "ReferenceRegion": "chr21:42132054-42132091", + "LocusStructure": "(CGCGGGGCGGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "CGCGGGGCGGGG", + "Disease": "EPM1", + "NormalMax": 3.0, + "PathologicMin": 30.0, + "Gene": "CSTB" +}, +{ + "LocusId": "TOF_TBX1", + "ReferenceRegion": "chr22:20143615-20143660", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "TOF", + "NormalMax": 15.0, + "PathologicMin": 25.0, + "Gene": "TBX1" +}, +{ + "LocusId": "SCA10_ATXN10", + "ReferenceRegion": "chr22:46280059-46280134", + "LocusStructure": "(ATTCT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "ATTCT", + "Disease": "SCA10", + "NormalMax": 32.0, + "PathologicMin": 800.0, + "Gene": "ATXN10" +}, +{ + "LocusId": "PRTS_ARX", + "ReferenceRegion": "chrX:24597766-24597802", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "PRTS", + "NormalMax": 12.0, + "PathologicMin": 20.0, + "Gene": "ARX" +}, +{ + "LocusId": "EIEE1_ARX", + "ReferenceRegion": 
"chrX:24597886-24597934", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "EIEE1", + "NormalMax": 16.0, + "PathologicMin": 17.0, + "Gene": "ARX" +}, +{ + "LocusId": "DMD_DMD", + "ReferenceRegion": ["chrX:30882677-30882743", "chrX:30882743-30882751"], + "LocusStructure": "(TTC)*(T)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["DMD_DMD", "DMD_DMD_T"], + "PathologicRegion": "chrX:30882677-30882743", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "TTC", + "Disease": "DMD", + "NormalMax": 33.0, + "PathologicMin": 59.0, + "Gene": "DMD" +}, +{ + "LocusId": "SBMA_AR", + "ReferenceRegion": "chrX:65975147-65975250", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCA", + "Disease": "SBMA", + "NormalMax": 34.0, + "PathologicMin": 38.0, + "Gene": "AR" +}, +{ + "LocusId": "XDP_TAF1", + "ReferenceRegion": "chrX:69887153-69887230", + "LocusStructure": "(AGAGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "AGAGGG", + "Disease": "XDP", + "NormalMax": 34.0, + "PathologicMin": 35.0, + "Gene": "TAF1" +}, +{ + "LocusId": "VACTERLX_ZIC3", + "ReferenceRegion": "chrX:135876774-135876804", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCN", + "Disease": "VACTERLX", + "NormalMax": 10.0, + "PathologicMin": 12.0, + "Gene": "ZIC3" +}, +{ + "LocusId": "XLMR_SOX3", + "ReferenceRegion": "chrX:138816203-138816248", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "XLMR", + "NormalMax": 15.0, + "PathologicMin": 22.0, + "Gene": "SOX3" +}, +{ + "LocusId": "FXS_FMR1", + "ReferenceRegion": "chrX:146176677-146176769", + "LocusStructure": "(CGG)*", + "VariantType": "Repeat", + "HGNCId": null, + 
"InheritanceMode": ["XD"], + "DisplayRU": "CGG", + "Disease": "FXS, FXTAS, POF1", + "NormalMax": 44.0, + "PathologicMin": 201.0, + "Gene": "FMR1" +}, +{ + "LocusId": "FRAXE_AFF2", + "ReferenceRegion": "chrX:146765190-146765342", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCC", + "Disease": "FRAXE", + "NormalMax": 39.0, + "PathologicMin": 201.0, + "Gene": "AFF2" +}] diff --git a/data/STRchive-disease-loci.hg19.TRGT.bed b/data/catalogs/STRchive-disease-loci.hg19.TRGT.bed similarity index 80% rename from data/STRchive-disease-loci.hg19.TRGT.bed rename to data/catalogs/STRchive-disease-loci.hg19.TRGT.bed index 3eb5ce48..594e86c8 100644 --- a/data/STRchive-disease-loci.hg19.TRGT.bed +++ b/data/catalogs/STRchive-disease-loci.hg19.TRGT.bed @@ -8,15 +8,15 @@ chr2 96862804 96862862 ID=FAME2_STARD7;MOTIFS=AAATG,AAAAT;STRUC= chr2 100721260 100721286 ID=FRA2A_AFF3;MOTIFS=GCC;STRUC= chr2 176957786 176957831 ID=SD5_HOXD13;MOTIFS=GCN;STRUC= chr2 191745598 191745646 ID=GDPAG_GLS;MOTIFS=GCA;STRUC= -chr3 63898360 63898403 ID=SCA7_ATXN7;MOTIFS=CAG,CCG;STRUC= -chr3 128891419 128891577 ID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= +chr3 63898360 63898391 ID=SCA7_ATXN7;MOTIFS=CAG,CCG;STRUC= +chr3 128891419 128891499 ID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= chr3 138664861 138664904 ID=BPES_FOXL2;MOTIFS=NGC;STRUC= -chr3 183429975 183430014 ID=FAME4_YEATS2;MOTIFS=TTTTA,TTTCA;STRUC= -chr4 3076603 3076696 ID=HD_HTT;MOTIFS=CAG,CCG;STRUC= +chr3 183429975 183430014 ID=FAME4_YEATS2;MOTIFS=TTTCA,TTTTA;STRUC= +chr4 3076603 3076660 ID=HD_HTT;MOTIFS=CAG,CCG;STRUC= chr4 39350044 39350103 ID=CANVAS_RFC1;MOTIFS=AAGGG,ACAGG,AGGGC,AAGGC,AGAGG,AAAAG,AAAGG,AAGAG,AAAGGG;STRUC= chr4 41747989 41748049 ID=CCHS_PHOX2B;MOTIFS=GCN;STRUC= -chr4 160263678 160263770 ID=FAME7_RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC= -chr5 10356455 10356523 ID=FAME3_MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC= +chr4 160263678 160263770 ID=FAME7_RAPGEF2;MOTIFS=TTTCA,TTTTA;STRUC= +chr5 
10356455 10356523 ID=FAME3_MARCHF6;MOTIFS=TTTCA,TTTTA;STRUC= chr5 146258290 146258322 ID=SCA12_PPP2R2B;MOTIFS=GCT;STRUC= chr6 16327864 16327955 ID=SCA1_ATXN1;MOTIFS=CTG;STRUC= chr6 45390487 45390538 ID=CCD_RUNX2;MOTIFS=GCN;STRUC= @@ -26,9 +26,9 @@ chr7 27239444 27239480 ID=HFG_HOXA13-II;MOTIFS=NGC;STRUC= chr7 27239543 27239585 ID=HFG_HOXA13-I;MOTIFS=NGC;STRUC= chr7 55955293 55955332 ID=FRA7A_ZNF713;MOTIFS=GCG;STRUC= chr8 105601198 105601227 ID=OPDM1_LRP12;MOTIFS=CGC;STRUC= -chr8 119379051 119379157 ID=FAME1_SAMD12;MOTIFS=TAAAA,TGAAA;STRUC= +chr8 119379051 119379157 ID=FAME1_SAMD12;MOTIFS=TGAAA,TAAAA;STRUC= chr9 27573482 27573544 ID=FTDALS1_C9orf72;MOTIFS=GGCCCC;STRUC= -chr9 71652186 71652220 ID=FRDA_FXN;MOTIFS=A,GAA;STRUC= +chr9 71652202 71652220 ID=FRDA_FXN;MOTIFS=A,GAA;STRUC= chr9 133556992 133557028 ID=HSAN-VIII_PRDM12;MOTIFS=GCC;STRUC= chr10 81586139 81586160 ID=OPML1_NUTM2B-AS1;MOTIFS=GGC;STRUC= chr11 119076999 119077033 ID=JBS_CBL;MOTIFS=CGG;STRUC= @@ -36,7 +36,7 @@ chr12 7045879 7045938 ID=DRPLA_ATN1;MOTIFS=CAG;STRUC= chr12 50898784 50898807 ID=FRA12A_DIP2B;MOTIFS=GGC;STRUC= chr12 112036753 112036823 ID=SCA2_ATXN2;MOTIFS=CTG;STRUC= chr12 124018267 124018297 ID=OPDM4_RILPL1;MOTIFS=GGC;STRUC= -chr13 70713485 70713561 ID=SCA8_ATXN8OS;MOTIFS=CTA,CTG;STRUC= +chr13 70713515 70713561 ID=SCA8_ATXN8OS;MOTIFS=CTA,CTG;STRUC= chr13 100637702 100637748 ID=HPE5_ZIC2;MOTIFS=GCN;STRUC= chr13 102813924 102814076 ID=SCA27B_FGF14;MOTIFS=GAA,GAAGGA,GAAGAAGAAGAAGCA,AAGGAG;STRUC= chr14 23790681 23790712 ID=OPMD_PABPN1;MOTIFS=GCN;STRUC= @@ -45,13 +45,13 @@ chr15 23086363 23086389 ID=ALS1_NIPA1;MOTIFS=GCG;STRUC= chr15 89112664 89112683 ID=MIR7-2_CHNG3;MOTIFS=TTTG;STRUC= chr15 89876819 89876860 ID=CPEO_POLG;MOTIFS=GCT;STRUC= chr16 17564764 17564779 ID=DBQD2_XYLT1;MOTIFS=GCC;STRUC= -chr16 24624759 24624853 ID=FAME6_TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC= +chr16 24624759 24624853 ID=FAME6_TNRC6A;MOTIFS=TTTCA,TTTTA;STRUC= chr16 66524299 66524369 ID=SCA31_BEAN1;MOTIFS=TGGAA,TAGAA,AATAA;STRUC= 
chr16 67876765 67876853 ID=SCA_THAP11;MOTIFS=CAG;STRUC= chr16 72821593 72821657 ID=SCA4_ZFHX3;MOTIFS=GCC;STRUC= chr16 87637888 87637935 ID=HDL2_JPH3;MOTIFS=CTG;STRUC= -chr17 17711672 17711774 ID=FAME8_RAI1;MOTIFS=ATTTT,TTTCA,TTTTA;STRUC= -chr17 78120808 78120938 ID=RCPS_EIF4A3;MOTIFS=CCTCGCTGCGCCGCTGCCGA,CCTCGCTGTGCCGCTGCCGA;STRUC= +chr17 17711672 17711774 ID=FAME8_RAI1;MOTIFS=TTTCA,TTTTA;STRUC= +chr17 78120808 78120938 ID=RCPS_EIF4A3;MOTIFS=CCTCGCTGTGCCGCTGCCGA;STRUC= chr18 666891 667632 ID=CPUM_TYMS;MOTIFS=GATGGT;STRUC= chr18 53253384 53253460 ID=FECD3_TCF4;MOTIFS=CAG;STRUC= chr19 4510739 4513671 ID=MRUPAV_PLIN4;MOTIFS=TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC;STRUC= @@ -59,14 +59,14 @@ chr19 13318672 13318712 ID=SCA6_CACNA1A;MOTIFS=CTG;STRUC= chr19 14606853 14606887 ID=OPDM2_GIPC1;MOTIFS=CCG;STRUC= chr19 18896844 18896860 ID=EDM1-PSACH_COMP;MOTIFS=GTC;STRUC= chr19 46273462 46273524 ID=DM1_DMPK;MOTIFS=CAG;STRUC= -chr20 2633378 2633421 ID=SCA36_NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC= -chr20 4680016 4680139 ID=CJD_PRNP;MOTIFS=CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT;STRUC= +chr20 2633378 2633403 ID=SCA36_NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC= +chr20 4680043 4680139 ID=CJD_PRNP;MOTIFS=CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT;STRUC= chr21 45196323 45196360 ID=EPM1_CSTB;MOTIFS=CGCGGGGCGGGG;STRUC= chr22 19754285 19754330 ID=TOF_TBX1;MOTIFS=GCN;STRUC= chr22 46191234 46191304 ID=SCA10_ATXN10;MOTIFS=ATTCT;STRUC= chrX 25031646 25031682 ID=PRTS_ARX;MOTIFS=NGC;STRUC= chrX 25031766 25031814 ID=EIEE1_ARX;MOTIFS=NGC;STRUC= -chrX 31302674 31302730 ID=DMD_DMD;MOTIFS=TTC,T;STRUC= +chrX 31302674 31302722 ID=DMD_DMD;MOTIFS=TTC,T;STRUC= chrX 66765158 66765261 ID=SBMA_AR;MOTIFS=GCA;STRUC= chrX 70672904 70672981 ID=XDP_TAF1;MOTIFS=AGAGGG;STRUC= chrX 136648985 136649015 ID=VACTERLX_ZIC3;MOTIFS=GCN;STRUC= diff --git a/data/catalogs/STRchive-disease-loci.hg19.atarva.bed 
b/data/catalogs/STRchive-disease-loci.hg19.atarva.bed new file mode 100644 index 00000000..7bcf827e --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg19.atarva.bed @@ -0,0 +1,88 @@ +#chrom start stop motif motif_len id +chr1 1371178 1371198 GGCGCGGAGC 10 HMNR7_VWA1 +chr1 57832680 57832715 AAAAT 5 SCA37_DAB1_flank +chr1 57832715 57832793 GAAAT 5 SCA37_DAB1 +chr1 94883977 94884000 GCC 3 OPDM5_ABCD3 +chr1 145209323 145209354 GGC 3 NIID_NOTCH2NLC +chr1 155160981 155162030 GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA 61 ADTKD_MUC1 +chr1 156561557 156561575 GGGCC 5 NME_NAXE +chr2 96862804 96862862 AAATG 5 FAME2_STARD7 +chr2 100721260 100721286 GCC 3 FRA2A_AFF3 +chr2 176957786 176957831 GCN 3 SD5_HOXD13 +chr2 191745598 191745646 GCA 3 GDPAG_GLS +chr3 63898360 63898391 CAG 3 SCA7_ATXN7 +chr3 63898391 63898403 CCG 3 SCA7_ATXN7_flank +chr3 128891419 128891499 CAGG 4 DM2_CNBP +chr3 128891499 128891539 CAGA 4 DM2_CNBP_flank +chr3 128891539 128891577 CA 2 DM2_CNBP_flank +chr3 138664861 138664904 NGC 3 BPES_FOXL2 +chr3 183429975 183430014 TTTCA 5 FAME4_YEATS2 +chr4 3076603 3076660 CAG 3 HD_HTT +chr4 3076666 3076702 CCG 3 HD_HTT_flank +chr4 39350044 39350103 AAGGG 5 CANVAS_RFC1 +chr4 41747989 41748049 GCN 3 CCHS_PHOX2B +chr4 160263678 160263770 TTTCA 5 FAME7_RAPGEF2 +chr5 10356455 10356523 TTTCA 5 FAME3_MARCHF6 +chr5 146258290 146258322 GCT 3 SCA12_PPP2R2B +chr6 16327864 16327955 CTG 3 SCA1_ATXN1 +chr6 45390487 45390538 GCN 3 CCD_RUNX2 +chr6 170870994 170871105 GCA 3 SCA17_TBP +chr7 27239297 27239351 NGC 3 HFG_HOXA13-III +chr7 27239444 27239480 NGC 3 HFG_HOXA13-II +chr7 27239543 27239585 NGC 3 HFG_HOXA13-I +chr7 55955293 55955332 GCG 3 FRA7A_ZNF713 +chr8 105601198 105601227 CGC 3 OPDM1_LRP12 +chr8 119379051 119379157 TGAAA 5 FAME1_SAMD12 +chr9 27573482 27573544 GGCCCC 6 FTDALS1_C9orf72 +chr9 71652186 71652202 A 1 FRDA_FXN_flank +chr9 71652202 71652220 GAA 3 FRDA_FXN +chr9 133556992 133557028 GCC 3 HSAN-VIII_PRDM12 +chr10 81586139 81586160 GGC 3 
OPML1_NUTM2B-AS1 +chr11 119076999 119077033 CGG 3 JBS_CBL +chr12 7045879 7045938 CAG 3 DRPLA_ATN1 +chr12 50898784 50898807 GGC 3 FRA12A_DIP2B +chr12 112036753 112036823 CTG 3 SCA2_ATXN2 +chr12 124018267 124018297 GGC 3 OPDM4_RILPL1 +chr13 70713485 70713515 CTA 3 SCA8_ATXN8OS_flank +chr13 70713515 70713561 CTG 3 SCA8_ATXN8OS +chr13 100637702 100637748 GCN 3 HPE5_ZIC2 +chr13 102813924 102814076 GAA 3 SCA27B_FGF14 +chr14 23790681 23790712 GCN 3 OPMD_PABPN1 +chr14 92537354 92537396 CTG 3 SCA3_ATXN3 +chr15 23086363 23086389 GCG 3 ALS1_NIPA1 +chr15 89112664 89112683 TTTG 4 MIR7-2_CHNG3 +chr15 89876810 89876816 GCT 3 CPEO_POLG_flank +chr15 89876819 89876860 GCT 3 CPEO_POLG +chr16 17564719 17564740 GCC 3 DBQD2_XYLT1_flank +chr16 17564764 17564779 GCC 3 DBQD2_XYLT1 +chr16 24624759 24624853 TTTCA 5 FAME6_TNRC6A +chr16 66524299 66524369 TGGAA 5 SCA31_BEAN1 +chr16 67876765 67876853 CAG 3 SCA_THAP11 +chr16 72821593 72821657 GCC 3 SCA4_ZFHX3 +chr16 87637888 87637935 CTG 3 HDL2_JPH3 +chr17 17711672 17711774 TTTCA 5 FAME8_RAI1 +chr17 78120808 78120938 CCTCGCTGTGCCGCTGCCGA 20 RCPS_EIF4A3 +chr18 666891 667632 GATGGT 6 CPUM_TYMS +chr18 53253384 53253460 CAG 3 FECD3_TCF4 +chr19 4510739 4513671 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC 99 MRUPAV_PLIN4 +chr19 13318672 13318712 CTG 3 SCA6_CACNA1A +chr19 14606853 14606887 CCG 3 OPDM2_GIPC1 +chr19 18896844 18896860 GTC 3 EDM1-PSACH_COMP +chr19 46273462 46273524 CAG 3 DM1_DMPK +chr20 2633378 2633403 GGCCTG 6 SCA36_NOP56 +chr20 2633403 2633421 CGCCTG 6 SCA36_NOP56_flank +chr20 4680016 4680043 CCTCAGGGCGGTGGTGGCTGGGGGCAG 27 CJD_PRNP_flank +chr20 4680043 4680139 CCTCATGGTGGTGGCTGGGGGCAG 24 CJD_PRNP +chr21 45196323 45196360 CGCGGGGCGGGG 12 EPM1_CSTB +chr22 19754285 19754330 GCN 3 TOF_TBX1 +chr22 46191234 46191304 ATTCT 5 SCA10_ATXN10 +chrX 25031646 25031682 NGC 3 PRTS_ARX +chrX 25031766 25031814 NGC 3 EIEE1_ARX +chrX 31302674 31302722 TTC 3 DMD_DMD +chrX 31302722 31302730 T 1 
DMD_DMD_flank +chrX 66765158 66765261 GCA 3 SBMA_AR +chrX 70672904 70672981 AGAGGG 6 XDP_TAF1 +chrX 136648985 136649015 GCN 3 VACTERLX_ZIC3 +chrX 139586481 139586526 NGC 3 XLMR_SOX3 +chrX 146993567 146993629 CGG 3 FXS_FMR1 +chrX 147582124 147582273 GCC 3 FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.hg19.atarva.bed.gz b/data/catalogs/STRchive-disease-loci.hg19.atarva.bed.gz new file mode 100644 index 00000000..84e35315 Binary files /dev/null and b/data/catalogs/STRchive-disease-loci.hg19.atarva.bed.gz differ diff --git a/data/catalogs/STRchive-disease-loci.hg19.atarva.bed.gz.tbi b/data/catalogs/STRchive-disease-loci.hg19.atarva.bed.gz.tbi new file mode 100644 index 00000000..a9190996 Binary files /dev/null and b/data/catalogs/STRchive-disease-loci.hg19.atarva.bed.gz.tbi differ diff --git a/data/STRchive-disease-loci.hg19.bed b/data/catalogs/STRchive-disease-loci.hg19.general.bed similarity index 100% rename from data/STRchive-disease-loci.hg19.bed rename to data/catalogs/STRchive-disease-loci.hg19.general.bed diff --git a/data/catalogs/STRchive-disease-loci.hg19.longTR.bed b/data/catalogs/STRchive-disease-loci.hg19.longTR.bed new file mode 100644 index 00000000..7f729122 --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg19.longTR.bed @@ -0,0 +1,75 @@ +chr1 1371179 1371198 GGCGCGGAGC HMNR7_VWA1 +chr1 57832716 57832793 GAAAT,AAAAT SCA37_DAB1 +chr1 94883978 94884000 GCC OPDM5_ABCD3 +chr1 145209324 145209354 GGC NIID_NOTCH2NLC +chr1 155160982 155162030 GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA,GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCA,GGCTNNGGGNGCGGTGGAGCCCGGGGCNGGNCTGNTNTCCGGGGCCGAGGTGACANCNTG ADTKD_MUC1 +chr1 156561558 156561575 GGGCC NME_NAXE +chr2 96862805 96862862 AAATG,AAAAT FAME2_STARD7 +chr2 100721261 100721286 GCC FRA2A_AFF3 +chr2 176957787 176957831 GCN SD5_HOXD13 +chr2 191745599 191745646 GCA GDPAG_GLS +chr3 63898361 63898391 CAG SCA7_ATXN7 +chr3 128891420 128891499 CAGG DM2_CNBP +chr3 138664862 
138664904 NGC BPES_FOXL2 +chr3 183429976 183430014 TTTCA,TTTTA FAME4_YEATS2 +chr4 3076604 3076660 CAG HD_HTT +chr4 39350045 39350103 AAGGG,ACAGG,AGGGC,AAGGC,AGAGG,AAAAG,AAAGG,AAGAG,AAAGGG CANVAS_RFC1 +chr4 41747990 41748049 GCN CCHS_PHOX2B +chr4 160263679 160263770 TTTCA,TTTTA FAME7_RAPGEF2 +chr5 10356456 10356523 TTTCA,TTTTA FAME3_MARCHF6 +chr5 146258291 146258322 GCT SCA12_PPP2R2B +chr6 16327865 16327955 CTG SCA1_ATXN1 +chr6 45390488 45390538 GCN CCD_RUNX2 +chr6 170870995 170871105 GCA SCA17_TBP +chr7 27239298 27239351 NGC HFG_HOXA13-III +chr7 27239445 27239480 NGC HFG_HOXA13-II +chr7 27239544 27239585 NGC HFG_HOXA13-I +chr7 55955294 55955332 GCG FRA7A_ZNF713 +chr8 105601199 105601227 CGC OPDM1_LRP12 +chr8 119379052 119379157 TGAAA,TAAAA FAME1_SAMD12 +chr9 27573483 27573544 GGCCCC FTDALS1_C9orf72 +chr9 71652203 71652220 GAA FRDA_FXN +chr9 133556993 133557028 GCC HSAN-VIII_PRDM12 +chr10 81586140 81586160 GGC OPML1_NUTM2B-AS1 +chr11 119077000 119077033 CGG JBS_CBL +chr12 7045880 7045938 CAG DRPLA_ATN1 +chr12 50898785 50898807 GGC FRA12A_DIP2B +chr12 112036754 112036823 CTG SCA2_ATXN2 +chr12 124018268 124018297 GGC OPDM4_RILPL1 +chr13 70713516 70713561 CTG SCA8_ATXN8OS +chr13 100637703 100637748 GCN HPE5_ZIC2 +chr13 102813925 102814076 GAA,GAAGGA,GAAGAAGAAGAAGCA,AAGGAG SCA27B_FGF14 +chr14 23790682 23790712 GCN OPMD_PABPN1 +chr14 92537355 92537396 CTG SCA3_ATXN3 +chr15 23086364 23086389 GCG ALS1_NIPA1 +chr15 89112665 89112683 TTTG MIR7-2_CHNG3 +chr15 89876820 89876860 GCT CPEO_POLG +chr16 17564765 17564779 GCC DBQD2_XYLT1 +chr16 24624760 24624853 TTTCA,TTTTA FAME6_TNRC6A +chr16 66524300 66524369 TGGAA,TAGAA,AATAA SCA31_BEAN1 +chr16 67876766 67876853 CAG SCA_THAP11 +chr16 72821594 72821657 GCC SCA4_ZFHX3 +chr16 87637889 87637935 CTG HDL2_JPH3 +chr17 17711673 17711774 TTTCA,TTTTA FAME8_RAI1 +chr17 78120809 78120938 CCTCGCTGTGCCGCTGCCGA RCPS_EIF4A3 +chr18 666892 667632 GATGGT CPUM_TYMS +chr18 53253385 53253460 CAG FECD3_TCF4 +chr19 4510740 4513671 
TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC MRUPAV_PLIN4 +chr19 13318673 13318712 CTG SCA6_CACNA1A +chr19 14606854 14606887 CCG OPDM2_GIPC1 +chr19 18896845 18896860 GTC EDM1-PSACH_COMP +chr19 46273463 46273524 CAG DM1_DMPK +chr20 2633379 2633403 GGCCTG SCA36_NOP56 +chr20 4680044 4680139 CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT CJD_PRNP +chr21 45196324 45196360 CGCGGGGCGGGG EPM1_CSTB +chr22 19754286 19754330 GCN TOF_TBX1 +chr22 46191235 46191304 ATTCT SCA10_ATXN10 +chrX 25031647 25031682 NGC PRTS_ARX +chrX 25031767 25031814 NGC EIEE1_ARX +chrX 31302675 31302722 TTC DMD_DMD +chrX 66765159 66765261 GCA SBMA_AR +chrX 70672905 70672981 AGAGGG XDP_TAF1 +chrX 136648986 136649015 GCN VACTERLX_ZIC3 +chrX 139586482 139586526 NGC XLMR_SOX3 +chrX 146993568 146993629 CGG FXS_FMR1 +chrX 147582125 147582273 GCC FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.hg19.straglr.bed b/data/catalogs/STRchive-disease-loci.hg19.straglr.bed new file mode 100644 index 00000000..eb77b06c --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg19.straglr.bed @@ -0,0 +1,86 @@ +chr1 1371178 1371198 GGCGCGGAGC HMNR7_VWA1 HMNR7_VWA1 +chr1 57832680 57832715 AAAAT SCA37_DAB1 SCA37_DAB1_AAAAT +chr1 57832715 57832793 GAAAT SCA37_DAB1 SCA37_DAB1 +chr1 94883977 94884000 GCC OPDM5_ABCD3 OPDM5_ABCD3 +chr1 145209323 145209354 GGC NIID_NOTCH2NLC NIID_NOTCH2NLC +chr1 156561557 156561575 GGGCC NME_NAXE NME_NAXE +chr2 96862804 96862862 AAATG FAME2_STARD7 FAME2_STARD7 +chr2 100721260 100721286 GCC FRA2A_AFF3 FRA2A_AFF3 +chr2 176957786 176957831 GCN SD5_HOXD13 SD5_HOXD13 +chr2 191745598 191745646 GCA GDPAG_GLS GDPAG_GLS +chr3 63898360 63898391 CAG SCA7_ATXN7 SCA7_ATXN7 +chr3 63898391 63898403 CCG SCA7_ATXN7 SCA7_ATXN7_CCG +chr3 128891419 128891499 CAGG DM2_CNBP DM2_CNBP +chr3 128891499 128891539 CAGA DM2_CNBP DM2_CNBP_CAGA +chr3 128891539 128891577 CA DM2_CNBP DM2_CNBP_CA +chr3 138664861 138664904 NGC BPES_FOXL2 BPES_FOXL2 +chr3 
183429975 183430014 TTTCA FAME4_YEATS2 FAME4_YEATS2 +chr4 3076603 3076660 CAG HD_HTT HD_HTT +chr4 3076666 3076702 CCG HD_HTT HD_HTT_CCG +chr4 39350044 39350103 AAGGG CANVAS_RFC1 CANVAS_RFC1 +chr4 41747989 41748049 GCN CCHS_PHOX2B CCHS_PHOX2B +chr4 160263678 160263770 TTTCA FAME7_RAPGEF2 FAME7_RAPGEF2 +chr5 10356455 10356523 TTTCA FAME3_MARCHF6 FAME3_MARCHF6 +chr5 146258290 146258322 GCT SCA12_PPP2R2B SCA12_PPP2R2B +chr6 16327864 16327955 CTG SCA1_ATXN1 SCA1_ATXN1 +chr6 45390487 45390538 GCN CCD_RUNX2 CCD_RUNX2 +chr6 170870994 170871105 GCA SCA17_TBP SCA17_TBP +chr7 27239297 27239351 NGC HFG_HOXA13-III HFG_HOXA13-III +chr7 27239444 27239480 NGC HFG_HOXA13-II HFG_HOXA13-II +chr7 27239543 27239585 NGC HFG_HOXA13-I HFG_HOXA13-I +chr7 55955293 55955332 GCG FRA7A_ZNF713 FRA7A_ZNF713 +chr8 105601198 105601227 CGC OPDM1_LRP12 OPDM1_LRP12 +chr8 119379051 119379157 TGAAA FAME1_SAMD12 FAME1_SAMD12 +chr9 27573482 27573544 GGCCCC FTDALS1_C9orf72 FTDALS1_C9orf72 +chr9 71652186 71652202 A FRDA_FXN FRDA_FXN_A +chr9 71652202 71652220 GAA FRDA_FXN FRDA_FXN +chr9 133556992 133557028 GCC HSAN-VIII_PRDM12 HSAN-VIII_PRDM12 +chr10 81586139 81586160 GGC OPML1_NUTM2B-AS1 OPML1_NUTM2B-AS1 +chr11 119076999 119077033 CGG JBS_CBL JBS_CBL +chr12 7045879 7045938 CAG DRPLA_ATN1 DRPLA_ATN1 +chr12 50898784 50898807 GGC FRA12A_DIP2B FRA12A_DIP2B +chr12 112036753 112036823 CTG SCA2_ATXN2 SCA2_ATXN2 +chr12 124018267 124018297 GGC OPDM4_RILPL1 OPDM4_RILPL1 +chr13 70713485 70713515 CTA SCA8_ATXN8OS SCA8_ATXN8OS_CTA +chr13 70713515 70713561 CTG SCA8_ATXN8OS SCA8_ATXN8OS +chr13 100637702 100637748 GCN HPE5_ZIC2 HPE5_ZIC2 +chr13 102813924 102814076 GAA SCA27B_FGF14 SCA27B_FGF14 +chr14 23790681 23790712 GCN OPMD_PABPN1 OPMD_PABPN1 +chr14 92537354 92537396 CTG SCA3_ATXN3 SCA3_ATXN3 +chr15 23086363 23086389 GCG ALS1_NIPA1 ALS1_NIPA1 +chr15 89112664 89112683 TTTG MIR7-2_CHNG3 MIR7-2_CHNG3 +chr15 89876810 89876816 GCT CPEO_POLG CPEO_POLG_GCT +chr15 89876819 89876860 GCT CPEO_POLG CPEO_POLG +chr16 17564719 
17564740 GCC DBQD2_XYLT1 DBQD2_XYLT1_GCC +chr16 17564764 17564779 GCC DBQD2_XYLT1 DBQD2_XYLT1 +chr16 24624759 24624853 TTTCA FAME6_TNRC6A FAME6_TNRC6A +chr16 66524299 66524369 TGGAA SCA31_BEAN1 SCA31_BEAN1 +chr16 67876765 67876853 CAG SCA_THAP11 SCA_THAP11 +chr16 72821593 72821657 GCC SCA4_ZFHX3 SCA4_ZFHX3 +chr16 87637888 87637935 CTG HDL2_JPH3 HDL2_JPH3 +chr17 17711672 17711774 TTTCA FAME8_RAI1 FAME8_RAI1 +chr17 78120808 78120938 CCTCGCTGTGCCGCTGCCGA RCPS_EIF4A3 RCPS_EIF4A3 +chr18 666891 667632 GATGGT CPUM_TYMS CPUM_TYMS +chr18 53253384 53253460 CAG FECD3_TCF4 FECD3_TCF4 +chr19 4510739 4513671 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC MRUPAV_PLIN4 MRUPAV_PLIN4 +chr19 13318672 13318712 CTG SCA6_CACNA1A SCA6_CACNA1A +chr19 14606853 14606887 CCG OPDM2_GIPC1 OPDM2_GIPC1 +chr19 18896844 18896860 GTC EDM1-PSACH_COMP EDM1-PSACH_COMP +chr19 46273462 46273524 CAG DM1_DMPK DM1_DMPK +chr20 2633378 2633403 GGCCTG SCA36_NOP56 SCA36_NOP56 +chr20 2633403 2633421 CGCCTG SCA36_NOP56 SCA36_NOP56_CGCCTG +chr20 4680016 4680043 CCTCAGGGCGGTGGTGGCTGGGGGCAG CJD_PRNP CJD_PRNP_CCTCAGGGCGGTGGTGGCTGGGGGCAG +chr20 4680043 4680139 CCTCATGGTGGTGGCTGGGGGCAG CJD_PRNP CJD_PRNP +chr21 45196323 45196360 CGCGGGGCGGGG EPM1_CSTB EPM1_CSTB +chr22 19754285 19754330 GCN TOF_TBX1 TOF_TBX1 +chr22 46191234 46191304 ATTCT SCA10_ATXN10 SCA10_ATXN10 +chrX 25031646 25031682 NGC PRTS_ARX PRTS_ARX +chrX 25031766 25031814 NGC EIEE1_ARX EIEE1_ARX +chrX 31302674 31302722 TTC DMD_DMD DMD_DMD +chrX 31302722 31302730 T DMD_DMD DMD_DMD_T +chrX 66765158 66765261 GCA SBMA_AR SBMA_AR +chrX 70672904 70672981 AGAGGG XDP_TAF1 XDP_TAF1 +chrX 136648985 136649015 GCN VACTERLX_ZIC3 VACTERLX_ZIC3 +chrX 139586481 139586526 NGC XLMR_SOX3 XLMR_SOX3 +chrX 146993567 146993629 CGG FXS_FMR1 FXS_FMR1 +chrX 147582124 147582273 GCC FRAXE_AFF2 FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.hg19.stranger.json b/data/catalogs/STRchive-disease-loci.hg19.stranger.json new 
file mode 100644 index 00000000..15d14fbb --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg19.stranger.json @@ -0,0 +1,985 @@ +[ +{ + "LocusId": "HMNR7_VWA1", + "ReferenceRegion": "chr1:1371178-1371198", + "LocusStructure": "(GGCGCGGAGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GGCGCGGAGC", + "Disease": "HMNR7", + "NormalMax": 2.0, + "PathologicMin": 3.0, + "Gene": "VWA1" +}, +{ + "LocusId": "SCA37_DAB1", + "ReferenceRegion": ["chr1:57832680-57832715", "chr1:57832715-57832793"], + "LocusStructure": "(AAAAT)*(GAAAT)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA37_DAB1_AAAAT", "SCA37_DAB1"], + "PathologicRegion": "chr1:57832715-57832793", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GAAAT", + "Disease": "SCA37", + "NormalMax": 30.0, + "PathologicMin": 31.0, + "Gene": "DAB1" +}, +{ + "LocusId": "OPDM5_ABCD3", + "ReferenceRegion": "chr1:94883977-94884000", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "OPDM5", + "NormalMax": 44.0, + "PathologicMin": 118.0, + "Gene": "ABCD3" +}, +{ + "LocusId": "NIID_NOTCH2NLC", + "ReferenceRegion": "chr1:145209323-145209354", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "NIID", + "NormalMax": 37.0, + "PathologicMin": 66.0, + "Gene": "NOTCH2NLC" +}, +{ + "LocusId": "NME_NAXE", + "ReferenceRegion": "chr1:156561557-156561575", + "LocusStructure": "(GGGCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GGGCC", + "Disease": "NME", + "NormalMax": 7.0, + "PathologicMin": 200.0, + "Gene": "NAXE" +}, +{ + "LocusId": "FAME2_STARD7", + "ReferenceRegion": "chr2:96862804-96862862", + "LocusStructure": "(AAATG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "AAATG", + "Disease": 
"FAME2", + "NormalMax": 273.0, + "PathologicMin": 274.0, + "Gene": "STARD7" +}, +{ + "LocusId": "FRA2A_AFF3", + "ReferenceRegion": "chr2:100721260-100721286", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "FRA2A", + "NormalMax": 20.0, + "PathologicMin": 300.0, + "Gene": "AFF3" +}, +{ + "LocusId": "SD5_HOXD13", + "ReferenceRegion": "chr2:176957786-176957831", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "SD5", + "NormalMax": 15.0, + "PathologicMin": 22.0, + "Gene": "HOXD13" +}, +{ + "LocusId": "GDPAG_GLS", + "ReferenceRegion": "chr2:191745598-191745646", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCA", + "Disease": "GDPAG", + "NormalMax": 38.0, + "PathologicMin": 680.0, + "Gene": "GLS" +}, +{ + "LocusId": "SCA7_ATXN7", + "ReferenceRegion": ["chr3:63898360-63898391", "chr3:63898391-63898403"], + "LocusStructure": "(CAG)*(CCG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA7_ATXN7", "SCA7_ATXN7_CCG"], + "PathologicRegion": "chr3:63898360-63898391", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "SCA7", + "NormalMax": 27.0, + "PathologicMin": 37.0, + "Gene": "ATXN7" +}, +{ + "LocusId": "DM2_CNBP", + "ReferenceRegion": ["chr3:128891419-128891499", "chr3:128891499-128891539", "chr3:128891539-128891577"], + "LocusStructure": "(CAGG)*(CAGA)*(CA)*", + "VariantType": ["Repeat", "Repeat", "Repeat"], + "VariantId": ["DM2_CNBP", "DM2_CNBP_CAGA", "DM2_CNBP_CA"], + "PathologicRegion": "chr3:128891419-128891499", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAGG", + "Disease": "DM2", + "NormalMax": 26.0, + "PathologicMin": 75.0, + "Gene": "CNBP" +}, +{ + "LocusId": "BPES_FOXL2", + "ReferenceRegion": "chr3:138664861-138664904", + "LocusStructure": 
"(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "NGC", + "Disease": "BPES", + "NormalMax": 14.0, + "PathologicMin": 15.0, + "Gene": "FOXL2" +}, +{ + "LocusId": "FAME4_YEATS2", + "ReferenceRegion": "chr3:183429975-183430014", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME4", + "NormalMax": 999.0, + "PathologicMin": 1000.0, + "Gene": "YEATS2" +}, +{ + "LocusId": "HD_HTT", + "ReferenceRegion": ["chr4:3076603-3076660", "chr4:3076666-3076702"], + "LocusStructure": "(CAG)*CAACAG(CCG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["HD_HTT", "HD_HTT_CCG"], + "PathologicRegion": "chr4:3076603-3076660", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "HD", + "NormalMax": 26.0, + "PathologicMin": 36.0, + "Gene": "HTT" +}, +{ + "LocusId": "CANVAS_RFC1", + "ReferenceRegion": "chr4:39350044-39350103", + "LocusStructure": "(AAGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "AAGGG", + "Disease": "CANVAS", + "NormalMax": 11.0, + "PathologicMin": 400.0, + "Gene": "RFC1" +}, +{ + "LocusId": "CCHS_PHOX2B", + "ReferenceRegion": "chr4:41747989-41748049", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "CCHS", + "NormalMax": 20.0, + "PathologicMin": 26.0, + "Gene": "PHOX2B" +}, +{ + "LocusId": "FAME7_RAPGEF2", + "ReferenceRegion": "chr4:160263678-160263770", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME7", + "NormalMax": 59.0, + "PathologicMin": 60.0, + "Gene": "RAPGEF2" +}, +{ + "LocusId": "FAME3_MARCHF6", + "ReferenceRegion": "chr5:10356455-10356523", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": 
["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME3", + "NormalMax": 790.0, + "PathologicMin": 791.0, + "Gene": "MARCHF6" +}, +{ + "LocusId": "SCA12_PPP2R2B", + "ReferenceRegion": "chr5:146258290-146258322", + "LocusStructure": "(GCT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCT", + "Disease": "SCA12", + "NormalMax": 32.0, + "PathologicMin": 51.0, + "Gene": "PPP2R2B" +}, +{ + "LocusId": "SCA1_ATXN1", + "ReferenceRegion": "chr6:16327864-16327955", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA1", + "NormalMax": 35.0, + "PathologicMin": 39.0, + "Gene": "ATXN1" +}, +{ + "LocusId": "CCD_RUNX2", + "ReferenceRegion": "chr6:45390487-45390538", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "CCD", + "NormalMax": 17.0, + "PathologicMin": 20.0, + "Gene": "RUNX2" +}, +{ + "LocusId": "SCA17_TBP", + "ReferenceRegion": "chr6:170870994-170871105", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCA", + "Disease": "SCA17", + "NormalMax": 40.0, + "PathologicMin": 49.0, + "Gene": "TBP" +}, +{ + "LocusId": "HFG_HOXA13-III", + "ReferenceRegion": "chr7:27239297-27239351", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-III", + "NormalMax": 18.0, + "PathologicMin": 22.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "HFG_HOXA13-II", + "ReferenceRegion": "chr7:27239444-27239480", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-II", + "NormalMax": 12.0, + "PathologicMin": 18.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "HFG_HOXA13-I", + "ReferenceRegion": "chr7:27239543-27239585", + "LocusStructure": 
"(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-I", + "NormalMax": 14.0, + "PathologicMin": 22.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "FRA7A_ZNF713", + "ReferenceRegion": "chr7:55955293-55955332", + "LocusStructure": "(GCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCG", + "Disease": "FRA7A", + "NormalMax": 22.0, + "PathologicMin": 450.0, + "Gene": "ZNF713" +}, +{ + "LocusId": "OPDM1_LRP12", + "ReferenceRegion": "chr8:105601198-105601227", + "LocusStructure": "(CGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CGC", + "Disease": "OPDM1", + "NormalMax": 45.0, + "PathologicMin": 85.0, + "Gene": "LRP12" +}, +{ + "LocusId": "FAME1_SAMD12", + "ReferenceRegion": "chr8:119379051-119379157", + "LocusStructure": "(TGAAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGAAA", + "Disease": "FAME1", + "NormalMax": 104.0, + "PathologicMin": 105.0, + "Gene": "SAMD12" +}, +{ + "LocusId": "FTDALS1_C9orf72", + "ReferenceRegion": "chr9:27573482-27573544", + "LocusStructure": "(GGCCCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGCCCC", + "Disease": "FTDALS1", + "NormalMax": 23.0, + "PathologicMin": 251.0, + "Gene": "C9orf72" +}, +{ + "LocusId": "FRDA_FXN", + "ReferenceRegion": ["chr9:71652186-71652202", "chr9:71652202-71652220"], + "LocusStructure": "(A)*(GAA)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["FRDA_FXN_A", "FRDA_FXN"], + "PathologicRegion": "chr9:71652202-71652220", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GAA", + "Disease": "FRDA", + "NormalMax": 33.0, + "PathologicMin": 56.0, + "Gene": "FXN" +}, +{ + "LocusId": "HSAN-VIII_PRDM12", + "ReferenceRegion": "chr9:133556992-133557028", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + 
"InheritanceMode": ["AR"], + "DisplayRU": "GCC", + "Disease": "HSAN VIII", + "NormalMax": 14.0, + "PathologicMin": 18.0, + "Gene": "PRDM12" +}, +{ + "LocusId": "OPML1_NUTM2B-AS1", + "ReferenceRegion": "chr10:81586139-81586160", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "OPML1", + "NormalMax": 16.0, + "PathologicMin": 161.0, + "Gene": "NUTM2B-AS1" +}, +{ + "LocusId": "JBS_CBL", + "ReferenceRegion": "chr11:119076999-119077033", + "LocusStructure": "(CGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CGG", + "Disease": "JBS", + "NormalMax": 79.0, + "PathologicMin": 101.0, + "Gene": "CBL" +}, +{ + "LocusId": "DRPLA_ATN1", + "ReferenceRegion": "chr12:7045879-7045938", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "DRPLA", + "NormalMax": 35.0, + "PathologicMin": 48.0, + "Gene": "ATN1" +}, +{ + "LocusId": "FRA12A_DIP2B", + "ReferenceRegion": "chr12:50898784-50898807", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "FRA12A", + "NormalMax": 23.0, + "PathologicMin": 273.0, + "Gene": "DIP2B" +}, +{ + "LocusId": "SCA2_ATXN2", + "ReferenceRegion": "chr12:112036753-112036823", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "CTG", + "Disease": "SCA2", + "NormalMax": 30.0, + "PathologicMin": 35.0, + "Gene": "ATXN2" +}, +{ + "LocusId": "OPDM4_RILPL1", + "ReferenceRegion": "chr12:124018267-124018297", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "OPDM4", + "NormalMax": 16.0, + "PathologicMin": 120.0, + "Gene": "RILPL1" +}, +{ + "LocusId": "SCA8_ATXN8OS", + "ReferenceRegion": 
["chr13:70713485-70713515", "chr13:70713515-70713561"], + "LocusStructure": "(CTA)*(CTG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA8_ATXN8OS_CTA", "SCA8_ATXN8OS"], + "PathologicRegion": "chr13:70713515-70713561", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA8", + "NormalMax": 50.0, + "PathologicMin": 71.0, + "Gene": "ATXN8OS" +}, +{ + "LocusId": "HPE5_ZIC2", + "ReferenceRegion": "chr13:100637702-100637748", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "HPE5", + "NormalMax": 15.0, + "PathologicMin": 25.0, + "Gene": "ZIC2" +}, +{ + "LocusId": "SCA27B_FGF14", + "ReferenceRegion": "chr13:102813924-102814076", + "LocusStructure": "(GAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GAA", + "Disease": "SCA27B", + "NormalMax": 179.0, + "PathologicMin": 320.0, + "Gene": "FGF14" +}, +{ + "LocusId": "OPMD_PABPN1", + "ReferenceRegion": "chr14:23790681-23790712", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "GCN", + "Disease": "OPMD", + "NormalMax": 10.0, + "PathologicMin": 12.0, + "Gene": "PABPN1" +}, +{ + "LocusId": "SCA3_ATXN3", + "ReferenceRegion": "chr14:92537354-92537396", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA3, MJD", + "NormalMax": 44.0, + "PathologicMin": 60.0, + "Gene": "ATXN3" +}, +{ + "LocusId": "ALS1_NIPA1", + "ReferenceRegion": "chr15:23086363-23086389", + "LocusStructure": "(GCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCG", + "Disease": "ALS1", + "NormalMax": 10.0, + "PathologicMin": 11.0, + "Gene": "NIPA1" +}, +{ + "LocusId": "MIR7-2_CHNG3", + "ReferenceRegion": "chr15:89112664-89112683", + "LocusStructure": "(TTTG)*", + 
"VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTG", + "Disease": "CHNG3", + "NormalMax": 4.0, + "PathologicMin": 5.0, + "Gene": "MIR7-2" +}, +{ + "LocusId": "CPEO_POLG", + "ReferenceRegion": ["chr15:89876810-89876816", "chr15:89876819-89876860"], + "LocusStructure": "(GCT)*GTT(GCT)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["CPEO_POLG_GCT", "CPEO_POLG"], + "PathologicRegion": "chr15:89876819-89876860", + "HGNCId": null, + "InheritanceMode": [], + "DisplayRU": "GCT", + "Disease": "CPEO", + "NormalMax": 10.0, + "PathologicMin": 11.0, + "Gene": "POLG" +}, +{ + "LocusId": "DBQD2_XYLT1", + "ReferenceRegion": ["chr16:17564719-17564740", "chr16:17564764-17564779"], + "LocusStructure": "(GCC)*TCGGCTCGCCGCTGCTCCTCCTCC(GCC)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["DBQD2_XYLT1_GCC", "DBQD2_XYLT1"], + "PathologicRegion": "chr16:17564764-17564779", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCC", + "Disease": "DBQD2, BSS", + "NormalMax": 20.0, + "PathologicMin": 72.0, + "Gene": "XYLT1" +}, +{ + "LocusId": "FAME6_TNRC6A", + "ReferenceRegion": "chr16:24624759-24624853", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME6", + "NormalMax": 1099.0, + "PathologicMin": 1100.0, + "Gene": "TNRC6A" +}, +{ + "LocusId": "SCA31_BEAN1", + "ReferenceRegion": "chr16:66524299-66524369", + "LocusStructure": "(TGGAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGGAA", + "Disease": "SCA31", + "NormalMax": 109.0, + "PathologicMin": 110.0, + "Gene": "BEAN1" +}, +{ + "LocusId": "SCA_THAP11", + "ReferenceRegion": "chr16:67876765-67876853", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "SCA", + "NormalMax": 38.0, + "PathologicMin": 45.0, + "Gene": "THAP11" +}, +{ + 
"LocusId": "SCA4_ZFHX3", + "ReferenceRegion": "chr16:72821593-72821657", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "SCA4", + "NormalMax": 26.0, + "PathologicMin": 46.0, + "Gene": "ZFHX3" +}, +{ + "LocusId": "HDL2_JPH3", + "ReferenceRegion": "chr16:87637888-87637935", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "HDL2", + "NormalMax": 28.0, + "PathologicMin": 40.0, + "Gene": "JPH3" +}, +{ + "LocusId": "FAME8_RAI1", + "ReferenceRegion": "chr17:17711672-17711774", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME8", + "NormalMax": 8.0, + "PathologicMin": 9.0, + "Gene": "RAI1" +}, +{ + "LocusId": "RCPS_EIF4A3", + "ReferenceRegion": "chr17:78120808-78120938", + "LocusStructure": "(CCTCGCTGTGCCGCTGCCGA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "CCTCGCTGTGCCGCTGCCGA", + "Disease": "RCPS", + "NormalMax": 12.0, + "PathologicMin": 14.0, + "Gene": "EIF4A3" +}, +{ + "LocusId": "CPUM_TYMS", + "ReferenceRegion": "chr18:666891-667632", + "LocusStructure": "(GATGGT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GATGGT", + "Disease": "CPUM", + "NormalMax": 172, + "PathologicMin": 210, + "Gene": "TYMS" +}, +{ + "LocusId": "FECD3_TCF4", + "ReferenceRegion": "chr18:53253384-53253460", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "FECD3", + "NormalMax": 39.0, + "PathologicMin": 51.0, + "Gene": "TCF4" +}, +{ + "LocusId": "MRUPAV_PLIN4", + "ReferenceRegion": "chr19:4510739-4513671", + "LocusStructure": "(TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC)*", + 
"VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC", + "Disease": "MRUPAV", + "NormalMax": 31.0, + "PathologicMin": 37.0, + "Gene": "PLIN4" +}, +{ + "LocusId": "SCA6_CACNA1A", + "ReferenceRegion": "chr19:13318672-13318712", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA6", + "NormalMax": 18.0, + "PathologicMin": 21.0, + "Gene": "CACNA1A" +}, +{ + "LocusId": "OPDM2_GIPC1", + "ReferenceRegion": "chr19:14606853-14606887", + "LocusStructure": "(CCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CCG", + "Disease": "OPDM2", + "NormalMax": 32.0, + "PathologicMin": 73.0, + "Gene": "GIPC1" +}, +{ + "LocusId": "EDM1-PSACH_COMP", + "ReferenceRegion": "chr19:18896844-18896860", + "LocusStructure": "(GTC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GTC", + "Disease": "EDM1, PSACH", + "NormalMax": 5.0, + "PathologicMin": 6.0, + "Gene": "COMP" +}, +{ + "LocusId": "DM1_DMPK", + "ReferenceRegion": "chr19:46273462-46273524", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "DM1", + "NormalMax": 34.0, + "PathologicMin": 50.0, + "Gene": "DMPK" +}, +{ + "LocusId": "SCA36_NOP56", + "ReferenceRegion": ["chr20:2633378-2633403", "chr20:2633403-2633421"], + "LocusStructure": "(GGCCTG)*(CGCCTG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA36_NOP56", "SCA36_NOP56_CGCCTG"], + "PathologicRegion": "chr20:2633378-2633403", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGCCTG", + "Disease": "SCA36", + "NormalMax": 14.0, + "PathologicMin": 650.0, + "Gene": "NOP56" +}, +{ + "LocusId": "CJD_PRNP", + "ReferenceRegion": ["chr20:4680016-4680043", 
"chr20:4680043-4680139"], + "LocusStructure": "(CCTCAGGGCGGTGGTGGCTGGGGGCAG)*(CCTCATGGTGGTGGCTGGGGGCAG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["CJD_PRNP_CCTCAGGGCGGTGGTGGCTGGGGGCAG", "CJD_PRNP"], + "PathologicRegion": "chr20:4680043-4680139", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CCTCATGGTGGTGGCTGGGGGCAG", + "Disease": "CJD", + "NormalMax": 4.0, + "PathologicMin": 5.0, + "Gene": "PRNP" +}, +{ + "LocusId": "EPM1_CSTB", + "ReferenceRegion": "chr21:45196323-45196360", + "LocusStructure": "(CGCGGGGCGGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "CGCGGGGCGGGG", + "Disease": "EPM1", + "NormalMax": 3.0, + "PathologicMin": 30.0, + "Gene": "CSTB" +}, +{ + "LocusId": "TOF_TBX1", + "ReferenceRegion": "chr22:19754285-19754330", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "TOF", + "NormalMax": 15.0, + "PathologicMin": 25.0, + "Gene": "TBX1" +}, +{ + "LocusId": "SCA10_ATXN10", + "ReferenceRegion": "chr22:46191234-46191304", + "LocusStructure": "(ATTCT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "ATTCT", + "Disease": "SCA10", + "NormalMax": 32.0, + "PathologicMin": 800.0, + "Gene": "ATXN10" +}, +{ + "LocusId": "PRTS_ARX", + "ReferenceRegion": "chrX:25031646-25031682", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "PRTS", + "NormalMax": 12.0, + "PathologicMin": 20.0, + "Gene": "ARX" +}, +{ + "LocusId": "EIEE1_ARX", + "ReferenceRegion": "chrX:25031766-25031814", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "EIEE1", + "NormalMax": 16.0, + "PathologicMin": 17.0, + "Gene": "ARX" +}, +{ + "LocusId": "DMD_DMD", + "ReferenceRegion": ["chrX:31302674-31302722", 
"chrX:31302722-31302730"], + "LocusStructure": "(TTC)*(T)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["DMD_DMD", "DMD_DMD_T"], + "PathologicRegion": "chrX:31302674-31302722", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "TTC", + "Disease": "DMD", + "NormalMax": 33.0, + "PathologicMin": 59.0, + "Gene": "DMD" +}, +{ + "LocusId": "SBMA_AR", + "ReferenceRegion": "chrX:66765158-66765261", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCA", + "Disease": "SBMA", + "NormalMax": 34.0, + "PathologicMin": 38.0, + "Gene": "AR" +}, +{ + "LocusId": "XDP_TAF1", + "ReferenceRegion": "chrX:70672904-70672981", + "LocusStructure": "(AGAGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "AGAGGG", + "Disease": "XDP", + "NormalMax": 34.0, + "PathologicMin": 35.0, + "Gene": "TAF1" +}, +{ + "LocusId": "VACTERLX_ZIC3", + "ReferenceRegion": "chrX:136648985-136649015", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCN", + "Disease": "VACTERLX", + "NormalMax": 10.0, + "PathologicMin": 12.0, + "Gene": "ZIC3" +}, +{ + "LocusId": "XLMR_SOX3", + "ReferenceRegion": "chrX:139586481-139586526", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "XLMR", + "NormalMax": 15.0, + "PathologicMin": 22.0, + "Gene": "SOX3" +}, +{ + "LocusId": "FXS_FMR1", + "ReferenceRegion": "chrX:146993567-146993629", + "LocusStructure": "(CGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XD"], + "DisplayRU": "CGG", + "Disease": "FXS, FXTAS, POF1", + "NormalMax": 44.0, + "PathologicMin": 201.0, + "Gene": "FMR1" +}, +{ + "LocusId": "FRAXE_AFF2", + "ReferenceRegion": "chrX:147582124-147582273", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": 
["XR"], + "DisplayRU": "GCC", + "Disease": "FRAXE", + "NormalMax": 39.0, + "PathologicMin": 201.0, + "Gene": "AFF2" +}] diff --git a/data/STRchive-disease-loci.hg38.TRGT.bed b/data/catalogs/STRchive-disease-loci.hg38.TRGT.bed similarity index 80% rename from data/STRchive-disease-loci.hg38.TRGT.bed rename to data/catalogs/STRchive-disease-loci.hg38.TRGT.bed index 140c35dd..c512bd8d 100644 --- a/data/STRchive-disease-loci.hg38.TRGT.bed +++ b/data/catalogs/STRchive-disease-loci.hg38.TRGT.bed @@ -8,15 +8,15 @@ chr2 96197066 96197124 ID=FAME2_STARD7;MOTIFS=AAATG,AAAAT;STRUC= chr2 100104798 100104824 ID=FRA2A_AFF3;MOTIFS=GCC;STRUC= chr2 176093058 176093103 ID=SD5_HOXD13;MOTIFS=GCN;STRUC= chr2 190880872 190880920 ID=GDPAG_GLS;MOTIFS=GCA;STRUC= -chr3 63912684 63912727 ID=SCA7_ATXN7;MOTIFS=CAG,CCG;STRUC= -chr3 129172576 129172734 ID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= +chr3 63912684 63912715 ID=SCA7_ATXN7;MOTIFS=CAG,CCG;STRUC= +chr3 129172576 129172656 ID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC= chr3 138946019 138946062 ID=BPES_FOXL2;MOTIFS=NGC;STRUC= -chr3 183712187 183712226 ID=FAME4_YEATS2;MOTIFS=TTTTA,TTTCA;STRUC= -chr4 3074876 3074969 ID=HD_HTT;MOTIFS=CAG,CCG;STRUC= +chr3 183712187 183712226 ID=FAME4_YEATS2;MOTIFS=TTTCA,TTTTA;STRUC= +chr4 3074876 3074933 ID=HD_HTT;MOTIFS=CAG,CCG;STRUC= chr4 39348424 39348483 ID=CANVAS_RFC1;MOTIFS=AAGGG,ACAGG,AGGGC,AAGGC,AGAGG,AAAAG,AAAGG,AAGAG,AAAGGG;STRUC= chr4 41745972 41746032 ID=CCHS_PHOX2B;MOTIFS=GCN;STRUC= -chr4 159342526 159342618 ID=FAME7_RAPGEF2;MOTIFS=TTTTA,TTTCA;STRUC= -chr5 10356343 10356411 ID=FAME3_MARCHF6;MOTIFS=TTTTA,TTTCA;STRUC= +chr4 159342526 159342618 ID=FAME7_RAPGEF2;MOTIFS=TTTCA,TTTTA;STRUC= +chr5 10356343 10356411 ID=FAME3_MARCHF6;MOTIFS=TTTCA,TTTTA;STRUC= chr5 146878727 146878759 ID=SCA12_PPP2R2B;MOTIFS=GCT;STRUC= chr6 16327633 16327724 ID=SCA1_ATXN1;MOTIFS=CTG;STRUC= chr6 45422750 45422801 ID=CCD_RUNX2;MOTIFS=GCN;STRUC= @@ -26,9 +26,9 @@ chr7 27199825 27199861 ID=HFG_HOXA13-II;MOTIFS=NGC;STRUC= chr7 27199924 
27199966 ID=HFG_HOXA13-I;MOTIFS=NGC;STRUC= chr7 55887600 55887639 ID=FRA7A_ZNF713;MOTIFS=GCG;STRUC= chr8 104588970 104588999 ID=OPDM1_LRP12;MOTIFS=CGC;STRUC= -chr8 118366812 118366918 ID=FAME1_SAMD12;MOTIFS=TAAAA,TGAAA;STRUC= +chr8 118366812 118366918 ID=FAME1_SAMD12;MOTIFS=TGAAA,TAAAA;STRUC= chr9 27573484 27573546 ID=FTDALS1_C9orf72;MOTIFS=GGCCCC;STRUC= -chr9 69037270 69037304 ID=FRDA_FXN;MOTIFS=A,GAA;STRUC= +chr9 69037286 69037304 ID=FRDA_FXN;MOTIFS=A,GAA;STRUC= chr9 130681605 130681641 ID=HSAN-VIII_PRDM12;MOTIFS=GCC;STRUC= chr10 79826383 79826404 ID=OPML1_NUTM2B-AS1;MOTIFS=GGC;STRUC= chr11 119206289 119206323 ID=JBS_CBL;MOTIFS=CGG;STRUC= @@ -36,7 +36,7 @@ chr12 6936716 6936775 ID=DRPLA_ATN1;MOTIFS=CAG;STRUC= chr12 50505001 50505024 ID=FRA12A_DIP2B;MOTIFS=GGC;STRUC= chr12 111598949 111599019 ID=SCA2_ATXN2;MOTIFS=CTG;STRUC= chr12 123533720 123533750 ID=OPDM4_RILPL1;MOTIFS=GGC;STRUC= -chr13 70139353 70139429 ID=SCA8_ATXN8OS;MOTIFS=CTA,CTG;STRUC= +chr13 70139383 70139429 ID=SCA8_ATXN8OS;MOTIFS=CTA,CTG;STRUC= chr13 99985448 99985494 ID=HPE5_ZIC2;MOTIFS=GCN;STRUC= chr13 102161574 102161726 ID=SCA27B_FGF14;MOTIFS=GAA,GAAGGA,GAAGAAGAAGAAGCA,AAGGAG;STRUC= chr14 23321472 23321503 ID=OPMD_PABPN1;MOTIFS=GCN;STRUC= @@ -45,13 +45,13 @@ chr15 22786677 22786703 ID=ALS1_NIPA1;MOTIFS=GCG;STRUC= chr15 88569433 88569452 ID=MIR7-2_CHNG3;MOTIFS=TTTG;STRUC= chr15 89333588 89333629 ID=CPEO_POLG;MOTIFS=GCT;STRUC= chr16 17470907 17470922 ID=DBQD2_XYLT1;MOTIFS=GCC;STRUC= -chr16 24613438 24613532 ID=FAME6_TNRC6A;MOTIFS=TTTTA,TTTCA;STRUC= +chr16 24613438 24613532 ID=FAME6_TNRC6A;MOTIFS=TTTCA,TTTTA;STRUC= chr16 66490396 66490466 ID=SCA31_BEAN1;MOTIFS=TGGAA,TAGAA,AATAA;STRUC= chr16 67842862 67842950 ID=SCA_THAP11;MOTIFS=CAG;STRUC= chr16 72787694 72787758 ID=SCA4_ZFHX3;MOTIFS=GCC;STRUC= chr16 87604282 87604329 ID=HDL2_JPH3;MOTIFS=CTG;STRUC= -chr17 17808358 17808460 ID=FAME8_RAI1;MOTIFS=ATTTT,TTTCA,TTTTA;STRUC= -chr17 80147009 80147139 
ID=RCPS_EIF4A3;MOTIFS=CCTCGCTGCGCCGCTGCCGA,CCTCGCTGTGCCGCTGCCGA;STRUC= +chr17 17808358 17808460 ID=FAME8_RAI1;MOTIFS=TTTCA,TTTTA;STRUC= +chr17 80147009 80147139 ID=RCPS_EIF4A3;MOTIFS=CCTCGCTGTGCCGCTGCCGA;STRUC= chr18 666891 667632 ID=CPUM_TYMS;MOTIFS=GATGGT;STRUC= chr18 55586153 55586229 ID=FECD3_TCF4;MOTIFS=CAG;STRUC= chr19 4510727 4513659 ID=MRUPAV_PLIN4;MOTIFS=TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC;STRUC= @@ -59,14 +59,14 @@ chr19 13207858 13207898 ID=SCA6_CACNA1A;MOTIFS=CTG;STRUC= chr19 14496041 14496075 ID=OPDM2_GIPC1;MOTIFS=CCG;STRUC= chr19 18786034 18786050 ID=EDM1-PSACH_COMP;MOTIFS=GTC;STRUC= chr19 45770204 45770266 ID=DM1_DMPK;MOTIFS=CAG;STRUC= -chr20 2652732 2652775 ID=SCA36_NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC= -chr20 4699370 4699493 ID=CJD_PRNP;MOTIFS=CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT;STRUC= +chr20 2652732 2652757 ID=SCA36_NOP56;MOTIFS=GGCCTG,CGCCTG;STRUC= +chr20 4699397 4699493 ID=CJD_PRNP;MOTIFS=CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT;STRUC= chr21 43776442 43776479 ID=EPM1_CSTB;MOTIFS=CGCGGGGCGGGG;STRUC= chr22 19766762 19766807 ID=TOF_TBX1;MOTIFS=GCN;STRUC= chr22 45795354 45795424 ID=SCA10_ATXN10;MOTIFS=ATTCT;STRUC= chrX 25013529 25013565 ID=PRTS_ARX;MOTIFS=NGC;STRUC= chrX 25013649 25013697 ID=EIEE1_ARX;MOTIFS=NGC;STRUC= -chrX 31284557 31284613 ID=DMD_DMD;MOTIFS=TTC,T;STRUC= +chrX 31284557 31284605 ID=DMD_DMD;MOTIFS=TTC,T;STRUC= chrX 67545316 67545419 ID=SBMA_AR;MOTIFS=GCA;STRUC= chrX 71453054 71453131 ID=XDP_TAF1;MOTIFS=AGAGGG;STRUC= chrX 137566826 137566856 ID=VACTERLX_ZIC3;MOTIFS=GCN;STRUC= diff --git a/data/catalogs/STRchive-disease-loci.hg38.atarva.bed b/data/catalogs/STRchive-disease-loci.hg38.atarva.bed new file mode 100644 index 00000000..e4f0f29e --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg38.atarva.bed @@ -0,0 +1,88 @@ +#chrom start stop motif motif_len id +chr1 1435798 1435818 GGCGCGGAGC 10 
HMNR7_VWA1 +chr1 57367008 57367043 AAAAT 5 SCA37_DAB1_flank +chr1 57367043 57367121 GAAAT 5 SCA37_DAB1 +chr1 94418421 94418444 GCC 3 OPDM5_ABCD3 +chr1 149390802 149390842 GGC 3 NIID_NOTCH2NLC +chr1 155188505 155192239 GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA 61 ADTKD_MUC1 +chr1 156591765 156591783 GGGCC 5 NME_NAXE +chr2 96197066 96197124 AAATG 5 FAME2_STARD7 +chr2 100104798 100104824 GCC 3 FRA2A_AFF3 +chr2 176093058 176093103 GCN 3 SD5_HOXD13 +chr2 190880872 190880920 GCA 3 GDPAG_GLS +chr3 63912684 63912715 CAG 3 SCA7_ATXN7 +chr3 63912715 63912727 CCG 3 SCA7_ATXN7_flank +chr3 129172576 129172656 CAGG 4 DM2_CNBP +chr3 129172656 129172696 CAGA 4 DM2_CNBP_flank +chr3 129172696 129172734 CA 2 DM2_CNBP_flank +chr3 138946019 138946062 NGC 3 BPES_FOXL2 +chr3 183712187 183712226 TTTCA 5 FAME4_YEATS2 +chr4 3074876 3074933 CAG 3 HD_HTT +chr4 3074939 3074975 CCG 3 HD_HTT_flank +chr4 39348424 39348483 AAGGG 5 CANVAS_RFC1 +chr4 41745972 41746032 GCN 3 CCHS_PHOX2B +chr4 159342526 159342618 TTTCA 5 FAME7_RAPGEF2 +chr5 10356343 10356411 TTTCA 5 FAME3_MARCHF6 +chr5 146878727 146878759 GCT 3 SCA12_PPP2R2B +chr6 16327633 16327724 CTG 3 SCA1_ATXN1 +chr6 45422750 45422801 GCN 3 CCD_RUNX2 +chr6 170561906 170562017 GCA 3 SCA17_TBP +chr7 27199678 27199732 NGC 3 HFG_HOXA13-III +chr7 27199825 27199861 NGC 3 HFG_HOXA13-II +chr7 27199924 27199966 NGC 3 HFG_HOXA13-I +chr7 55887600 55887639 GCG 3 FRA7A_ZNF713 +chr8 104588970 104588999 CGC 3 OPDM1_LRP12 +chr8 118366812 118366918 TGAAA 5 FAME1_SAMD12 +chr9 27573484 27573546 GGCCCC 6 FTDALS1_C9orf72 +chr9 69037270 69037286 A 1 FRDA_FXN_flank +chr9 69037286 69037304 GAA 3 FRDA_FXN +chr9 130681605 130681641 GCC 3 HSAN-VIII_PRDM12 +chr10 79826383 79826404 GGC 3 OPML1_NUTM2B-AS1 +chr11 119206289 119206323 CGG 3 JBS_CBL +chr12 6936716 6936775 CAG 3 DRPLA_ATN1 +chr12 50505001 50505024 GGC 3 FRA12A_DIP2B +chr12 111598949 111599019 CTG 3 SCA2_ATXN2 +chr12 123533720 123533750 GGC 3 OPDM4_RILPL1 +chr13 70139353 70139383 CTA 3 
SCA8_ATXN8OS_flank +chr13 70139383 70139429 CTG 3 SCA8_ATXN8OS +chr13 99985448 99985494 GCN 3 HPE5_ZIC2 +chr13 102161574 102161726 GAA 3 SCA27B_FGF14 +chr14 23321472 23321503 GCN 3 OPMD_PABPN1 +chr14 92071010 92071052 CTG 3 SCA3_ATXN3 +chr15 22786677 22786703 GCG 3 ALS1_NIPA1 +chr15 88569433 88569452 TTTG 4 MIR7-2_CHNG3 +chr15 89333579 89333585 GCT 3 CPEO_POLG_flank +chr15 89333588 89333629 GCT 3 CPEO_POLG +chr16 17470862 17470883 GCC 3 DBQD2_XYLT1_flank +chr16 17470907 17470922 GCC 3 DBQD2_XYLT1 +chr16 24613438 24613532 TTTCA 5 FAME6_TNRC6A +chr16 66490396 66490466 TGGAA 5 SCA31_BEAN1 +chr16 67842862 67842950 CAG 3 SCA_THAP11 +chr16 72787694 72787758 GCC 3 SCA4_ZFHX3 +chr16 87604282 87604329 CTG 3 HDL2_JPH3 +chr17 17808358 17808460 TTTCA 5 FAME8_RAI1 +chr17 80147009 80147139 CCTCGCTGTGCCGCTGCCGA 20 RCPS_EIF4A3 +chr18 666891 667632 GATGGT 6 CPUM_TYMS +chr18 55586153 55586229 CAG 3 FECD3_TCF4 +chr19 4510727 4513659 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC 99 MRUPAV_PLIN4 +chr19 13207858 13207898 CTG 3 SCA6_CACNA1A +chr19 14496041 14496075 CCG 3 OPDM2_GIPC1 +chr19 18786034 18786050 GTC 3 EDM1-PSACH_COMP +chr19 45770204 45770266 CAG 3 DM1_DMPK +chr20 2652732 2652757 GGCCTG 6 SCA36_NOP56 +chr20 2652757 2652775 CGCCTG 6 SCA36_NOP56_flank +chr20 4699370 4699397 CCTCAGGGCGGTGGTGGCTGGGGGCAG 27 CJD_PRNP_flank +chr20 4699397 4699493 CCTCATGGTGGTGGCTGGGGGCAG 24 CJD_PRNP +chr21 43776442 43776479 CGCGGGGCGGGG 12 EPM1_CSTB +chr22 19766762 19766807 GCN 3 TOF_TBX1 +chr22 45795354 45795424 ATTCT 5 SCA10_ATXN10 +chrX 25013529 25013565 NGC 3 PRTS_ARX +chrX 25013649 25013697 NGC 3 EIEE1_ARX +chrX 31284557 31284605 TTC 3 DMD_DMD +chrX 31284605 31284613 T 1 DMD_DMD_flank +chrX 67545316 67545419 GCA 3 SBMA_AR +chrX 71453054 71453131 AGAGGG 6 XDP_TAF1 +chrX 137566826 137566856 GCN 3 VACTERLX_ZIC3 +chrX 140504316 140504361 NGC 3 XLMR_SOX3 +chrX 147912049 147912111 CGG 3 FXS_FMR1 +chrX 148500604 148500753 GCC 3 FRAXE_AFF2 diff --git 
a/data/catalogs/STRchive-disease-loci.hg38.atarva.bed.gz b/data/catalogs/STRchive-disease-loci.hg38.atarva.bed.gz new file mode 100644 index 00000000..d921e768 Binary files /dev/null and b/data/catalogs/STRchive-disease-loci.hg38.atarva.bed.gz differ diff --git a/data/catalogs/STRchive-disease-loci.hg38.atarva.bed.gz.tbi b/data/catalogs/STRchive-disease-loci.hg38.atarva.bed.gz.tbi new file mode 100644 index 00000000..f2f63dd3 Binary files /dev/null and b/data/catalogs/STRchive-disease-loci.hg38.atarva.bed.gz.tbi differ diff --git a/data/STRchive-disease-loci.hg38.bed b/data/catalogs/STRchive-disease-loci.hg38.general.bed similarity index 100% rename from data/STRchive-disease-loci.hg38.bed rename to data/catalogs/STRchive-disease-loci.hg38.general.bed diff --git a/data/catalogs/STRchive-disease-loci.hg38.longTR.bed b/data/catalogs/STRchive-disease-loci.hg38.longTR.bed new file mode 100644 index 00000000..75337e50 --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg38.longTR.bed @@ -0,0 +1,75 @@ +chr1 1435799 1435818 GGCGCGGAGC HMNR7_VWA1 +chr1 57367044 57367121 GAAAT,AAAAT SCA37_DAB1 +chr1 94418422 94418444 GCC OPDM5_ABCD3 +chr1 149390803 149390842 GGC NIID_NOTCH2NLC +chr1 155188506 155192239 GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCCA,GCCCACGGTGTCACCTCGGCCCCGGACACCAGGCCGGCCCCGGGCTCCACCGCCCCCCCA,GGCTNNGGGNGCGGTGGAGCCCGGGGCNGGNCTGNTNTCCGGGGCCGAGGTGACANCNTG ADTKD_MUC1 +chr1 156591766 156591783 GGGCC NME_NAXE +chr2 96197067 96197124 AAATG,AAAAT FAME2_STARD7 +chr2 100104799 100104824 GCC FRA2A_AFF3 +chr2 176093059 176093103 GCN SD5_HOXD13 +chr2 190880873 190880920 GCA GDPAG_GLS +chr3 63912685 63912715 CAG SCA7_ATXN7 +chr3 129172577 129172656 CAGG DM2_CNBP +chr3 138946020 138946062 NGC BPES_FOXL2 +chr3 183712188 183712226 TTTCA,TTTTA FAME4_YEATS2 +chr4 3074877 3074933 CAG HD_HTT +chr4 39348425 39348483 AAGGG,ACAGG,AGGGC,AAGGC,AGAGG,AAAAG,AAAGG,AAGAG,AAAGGG CANVAS_RFC1 +chr4 41745973 41746032 GCN CCHS_PHOX2B +chr4 159342527 159342618 TTTCA,TTTTA 
FAME7_RAPGEF2 +chr5 10356344 10356411 TTTCA,TTTTA FAME3_MARCHF6 +chr5 146878728 146878759 GCT SCA12_PPP2R2B +chr6 16327634 16327724 CTG SCA1_ATXN1 +chr6 45422751 45422801 GCN CCD_RUNX2 +chr6 170561907 170562017 GCA SCA17_TBP +chr7 27199679 27199732 NGC HFG_HOXA13-III +chr7 27199826 27199861 NGC HFG_HOXA13-II +chr7 27199925 27199966 NGC HFG_HOXA13-I +chr7 55887601 55887639 GCG FRA7A_ZNF713 +chr8 104588971 104588999 CGC OPDM1_LRP12 +chr8 118366813 118366918 TGAAA,TAAAA FAME1_SAMD12 +chr9 27573485 27573546 GGCCCC FTDALS1_C9orf72 +chr9 69037287 69037304 GAA FRDA_FXN +chr9 130681606 130681641 GCC HSAN-VIII_PRDM12 +chr10 79826384 79826404 GGC OPML1_NUTM2B-AS1 +chr11 119206290 119206323 CGG JBS_CBL +chr12 6936717 6936775 CAG DRPLA_ATN1 +chr12 50505002 50505024 GGC FRA12A_DIP2B +chr12 111598950 111599019 CTG SCA2_ATXN2 +chr12 123533721 123533750 GGC OPDM4_RILPL1 +chr13 70139384 70139429 CTG SCA8_ATXN8OS +chr13 99985449 99985494 GCN HPE5_ZIC2 +chr13 102161575 102161726 GAA,GAAGGA,GAAGAAGAAGAAGCA,AAGGAG SCA27B_FGF14 +chr14 23321473 23321503 GCN OPMD_PABPN1 +chr14 92071011 92071052 CTG SCA3_ATXN3 +chr15 22786678 22786703 GCG ALS1_NIPA1 +chr15 88569434 88569452 TTTG MIR7-2_CHNG3 +chr15 89333589 89333629 GCT CPEO_POLG +chr16 17470908 17470922 GCC DBQD2_XYLT1 +chr16 24613439 24613532 TTTCA,TTTTA FAME6_TNRC6A +chr16 66490397 66490466 TGGAA,TAGAA,AATAA SCA31_BEAN1 +chr16 67842863 67842950 CAG SCA_THAP11 +chr16 72787695 72787758 GCC SCA4_ZFHX3 +chr16 87604283 87604329 CTG HDL2_JPH3 +chr17 17808359 17808460 TTTCA,TTTTA FAME8_RAI1 +chr17 80147010 80147139 CCTCGCTGTGCCGCTGCCGA RCPS_EIF4A3 +chr18 666892 667632 GATGGT CPUM_TYMS +chr18 55586154 55586229 CAG FECD3_TCF4 +chr19 4510728 4513659 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC MRUPAV_PLIN4 +chr19 13207859 13207898 CTG SCA6_CACNA1A +chr19 14496042 14496075 CCG OPDM2_GIPC1 +chr19 18786035 18786050 GTC EDM1-PSACH_COMP +chr19 45770205 45770266 CAG DM1_DMPK +chr20 2652733 2652757 
GGCCTG SCA36_NOP56 +chr20 4699398 4699493 CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT CJD_PRNP +chr21 43776443 43776479 CGCGGGGCGGGG EPM1_CSTB +chr22 19766763 19766807 GCN TOF_TBX1 +chr22 45795355 45795424 ATTCT SCA10_ATXN10 +chrX 25013530 25013565 NGC PRTS_ARX +chrX 25013650 25013697 NGC EIEE1_ARX +chrX 31284558 31284605 TTC DMD_DMD +chrX 67545317 67545419 GCA SBMA_AR +chrX 71453055 71453131 AGAGGG XDP_TAF1 +chrX 137566827 137566856 GCN VACTERLX_ZIC3 +chrX 140504317 140504361 NGC XLMR_SOX3 +chrX 147912050 147912111 CGG FXS_FMR1 +chrX 148500605 148500753 GCC FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.hg38.straglr.bed b/data/catalogs/STRchive-disease-loci.hg38.straglr.bed new file mode 100644 index 00000000..c9f37b8b --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg38.straglr.bed @@ -0,0 +1,86 @@ +chr1 1435798 1435818 GGCGCGGAGC HMNR7_VWA1 HMNR7_VWA1 +chr1 57367008 57367043 AAAAT SCA37_DAB1 SCA37_DAB1_AAAAT +chr1 57367043 57367121 GAAAT SCA37_DAB1 SCA37_DAB1 +chr1 94418421 94418444 GCC OPDM5_ABCD3 OPDM5_ABCD3 +chr1 149390802 149390842 GGC NIID_NOTCH2NLC NIID_NOTCH2NLC +chr1 156591765 156591783 GGGCC NME_NAXE NME_NAXE +chr2 96197066 96197124 AAATG FAME2_STARD7 FAME2_STARD7 +chr2 100104798 100104824 GCC FRA2A_AFF3 FRA2A_AFF3 +chr2 176093058 176093103 GCN SD5_HOXD13 SD5_HOXD13 +chr2 190880872 190880920 GCA GDPAG_GLS GDPAG_GLS +chr3 63912684 63912715 CAG SCA7_ATXN7 SCA7_ATXN7 +chr3 63912715 63912727 CCG SCA7_ATXN7 SCA7_ATXN7_CCG +chr3 129172576 129172656 CAGG DM2_CNBP DM2_CNBP +chr3 129172656 129172696 CAGA DM2_CNBP DM2_CNBP_CAGA +chr3 129172696 129172734 CA DM2_CNBP DM2_CNBP_CA +chr3 138946019 138946062 NGC BPES_FOXL2 BPES_FOXL2 +chr3 183712187 183712226 TTTCA FAME4_YEATS2 FAME4_YEATS2 +chr4 3074876 3074933 CAG HD_HTT HD_HTT +chr4 3074939 3074975 CCG HD_HTT HD_HTT_CCG +chr4 39348424 39348483 AAGGG CANVAS_RFC1 CANVAS_RFC1 +chr4 41745972 41746032 GCN CCHS_PHOX2B CCHS_PHOX2B +chr4 159342526 159342618 TTTCA FAME7_RAPGEF2 FAME7_RAPGEF2 +chr5 
10356343 10356411 TTTCA FAME3_MARCHF6 FAME3_MARCHF6 +chr5 146878727 146878759 GCT SCA12_PPP2R2B SCA12_PPP2R2B +chr6 16327633 16327724 CTG SCA1_ATXN1 SCA1_ATXN1 +chr6 45422750 45422801 GCN CCD_RUNX2 CCD_RUNX2 +chr6 170561906 170562017 GCA SCA17_TBP SCA17_TBP +chr7 27199678 27199732 NGC HFG_HOXA13-III HFG_HOXA13-III +chr7 27199825 27199861 NGC HFG_HOXA13-II HFG_HOXA13-II +chr7 27199924 27199966 NGC HFG_HOXA13-I HFG_HOXA13-I +chr7 55887600 55887639 GCG FRA7A_ZNF713 FRA7A_ZNF713 +chr8 104588970 104588999 CGC OPDM1_LRP12 OPDM1_LRP12 +chr8 118366812 118366918 TGAAA FAME1_SAMD12 FAME1_SAMD12 +chr9 27573484 27573546 GGCCCC FTDALS1_C9orf72 FTDALS1_C9orf72 +chr9 69037270 69037286 A FRDA_FXN FRDA_FXN_A +chr9 69037286 69037304 GAA FRDA_FXN FRDA_FXN +chr9 130681605 130681641 GCC HSAN-VIII_PRDM12 HSAN-VIII_PRDM12 +chr10 79826383 79826404 GGC OPML1_NUTM2B-AS1 OPML1_NUTM2B-AS1 +chr11 119206289 119206323 CGG JBS_CBL JBS_CBL +chr12 6936716 6936775 CAG DRPLA_ATN1 DRPLA_ATN1 +chr12 50505001 50505024 GGC FRA12A_DIP2B FRA12A_DIP2B +chr12 111598949 111599019 CTG SCA2_ATXN2 SCA2_ATXN2 +chr12 123533720 123533750 GGC OPDM4_RILPL1 OPDM4_RILPL1 +chr13 70139353 70139383 CTA SCA8_ATXN8OS SCA8_ATXN8OS_CTA +chr13 70139383 70139429 CTG SCA8_ATXN8OS SCA8_ATXN8OS +chr13 99985448 99985494 GCN HPE5_ZIC2 HPE5_ZIC2 +chr13 102161574 102161726 GAA SCA27B_FGF14 SCA27B_FGF14 +chr14 23321472 23321503 GCN OPMD_PABPN1 OPMD_PABPN1 +chr14 92071010 92071052 CTG SCA3_ATXN3 SCA3_ATXN3 +chr15 22786677 22786703 GCG ALS1_NIPA1 ALS1_NIPA1 +chr15 88569433 88569452 TTTG MIR7-2_CHNG3 MIR7-2_CHNG3 +chr15 89333579 89333585 GCT CPEO_POLG CPEO_POLG_GCT +chr15 89333588 89333629 GCT CPEO_POLG CPEO_POLG +chr16 17470862 17470883 GCC DBQD2_XYLT1 DBQD2_XYLT1_GCC +chr16 17470907 17470922 GCC DBQD2_XYLT1 DBQD2_XYLT1 +chr16 24613438 24613532 TTTCA FAME6_TNRC6A FAME6_TNRC6A +chr16 66490396 66490466 TGGAA SCA31_BEAN1 SCA31_BEAN1 +chr16 67842862 67842950 CAG SCA_THAP11 SCA_THAP11 +chr16 72787694 72787758 GCC SCA4_ZFHX3 SCA4_ZFHX3 +chr16 
87604282 87604329 CTG HDL2_JPH3 HDL2_JPH3 +chr17 17808358 17808460 TTTCA FAME8_RAI1 FAME8_RAI1 +chr17 80147009 80147139 CCTCGCTGTGCCGCTGCCGA RCPS_EIF4A3 RCPS_EIF4A3 +chr18 666891 667632 GATGGT CPUM_TYMS CPUM_TYMS +chr18 55586153 55586229 CAG FECD3_TCF4 FECD3_TCF4 +chr19 4510727 4513659 TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC MRUPAV_PLIN4 MRUPAV_PLIN4 +chr19 13207858 13207898 CTG SCA6_CACNA1A SCA6_CACNA1A +chr19 14496041 14496075 CCG OPDM2_GIPC1 OPDM2_GIPC1 +chr19 18786034 18786050 GTC EDM1-PSACH_COMP EDM1-PSACH_COMP +chr19 45770204 45770266 CAG DM1_DMPK DM1_DMPK +chr20 2652732 2652757 GGCCTG SCA36_NOP56 SCA36_NOP56 +chr20 2652757 2652775 CGCCTG SCA36_NOP56 SCA36_NOP56_CGCCTG +chr20 4699370 4699397 CCTCAGGGCGGTGGTGGCTGGGGGCAG CJD_PRNP CJD_PRNP_CCTCAGGGCGGTGGTGGCTGGGGGCAG +chr20 4699397 4699493 CCTCATGGTGGTGGCTGGGGGCAG CJD_PRNP CJD_PRNP +chr21 43776442 43776479 CGCGGGGCGGGG EPM1_CSTB EPM1_CSTB +chr22 19766762 19766807 GCN TOF_TBX1 TOF_TBX1 +chr22 45795354 45795424 ATTCT SCA10_ATXN10 SCA10_ATXN10 +chrX 25013529 25013565 NGC PRTS_ARX PRTS_ARX +chrX 25013649 25013697 NGC EIEE1_ARX EIEE1_ARX +chrX 31284557 31284605 TTC DMD_DMD DMD_DMD +chrX 31284605 31284613 T DMD_DMD DMD_DMD_T +chrX 67545316 67545419 GCA SBMA_AR SBMA_AR +chrX 71453054 71453131 AGAGGG XDP_TAF1 XDP_TAF1 +chrX 137566826 137566856 GCN VACTERLX_ZIC3 VACTERLX_ZIC3 +chrX 140504316 140504361 NGC XLMR_SOX3 XLMR_SOX3 +chrX 147912049 147912111 CGG FXS_FMR1 FXS_FMR1 +chrX 148500604 148500753 GCC FRAXE_AFF2 FRAXE_AFF2 diff --git a/data/catalogs/STRchive-disease-loci.hg38.stranger.json b/data/catalogs/STRchive-disease-loci.hg38.stranger.json new file mode 100644 index 00000000..b1687baa --- /dev/null +++ b/data/catalogs/STRchive-disease-loci.hg38.stranger.json @@ -0,0 +1,985 @@ +[ +{ + "LocusId": "HMNR7_VWA1", + "ReferenceRegion": "chr1:1435798-1435818", + "LocusStructure": "(GGCGCGGAGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": 
["AR"], + "DisplayRU": "GGCGCGGAGC", + "Disease": "HMNR7", + "NormalMax": 2.0, + "PathologicMin": 3.0, + "Gene": "VWA1" +}, +{ + "LocusId": "SCA37_DAB1", + "ReferenceRegion": ["chr1:57367008-57367043", "chr1:57367043-57367121"], + "LocusStructure": "(AAAAT)*(GAAAT)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA37_DAB1_AAAAT", "SCA37_DAB1"], + "PathologicRegion": "chr1:57367043-57367121", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GAAAT", + "Disease": "SCA37", + "NormalMax": 30.0, + "PathologicMin": 31.0, + "Gene": "DAB1" +}, +{ + "LocusId": "OPDM5_ABCD3", + "ReferenceRegion": "chr1:94418421-94418444", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "OPDM5", + "NormalMax": 44.0, + "PathologicMin": 118.0, + "Gene": "ABCD3" +}, +{ + "LocusId": "NIID_NOTCH2NLC", + "ReferenceRegion": "chr1:149390802-149390842", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "NIID", + "NormalMax": 37.0, + "PathologicMin": 66.0, + "Gene": "NOTCH2NLC" +}, +{ + "LocusId": "NME_NAXE", + "ReferenceRegion": "chr1:156591765-156591783", + "LocusStructure": "(GGGCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GGGCC", + "Disease": "NME", + "NormalMax": 7.0, + "PathologicMin": 200.0, + "Gene": "NAXE" +}, +{ + "LocusId": "FAME2_STARD7", + "ReferenceRegion": "chr2:96197066-96197124", + "LocusStructure": "(AAATG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "AAATG", + "Disease": "FAME2", + "NormalMax": 273.0, + "PathologicMin": 274.0, + "Gene": "STARD7" +}, +{ + "LocusId": "FRA2A_AFF3", + "ReferenceRegion": "chr2:100104798-100104824", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "FRA2A", + 
"NormalMax": 20.0, + "PathologicMin": 300.0, + "Gene": "AFF3" +}, +{ + "LocusId": "SD5_HOXD13", + "ReferenceRegion": "chr2:176093058-176093103", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "SD5", + "NormalMax": 15.0, + "PathologicMin": 22.0, + "Gene": "HOXD13" +}, +{ + "LocusId": "GDPAG_GLS", + "ReferenceRegion": "chr2:190880872-190880920", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCA", + "Disease": "GDPAG", + "NormalMax": 38.0, + "PathologicMin": 680.0, + "Gene": "GLS" +}, +{ + "LocusId": "SCA7_ATXN7", + "ReferenceRegion": ["chr3:63912684-63912715", "chr3:63912715-63912727"], + "LocusStructure": "(CAG)*(CCG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA7_ATXN7", "SCA7_ATXN7_CCG"], + "PathologicRegion": "chr3:63912684-63912715", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "SCA7", + "NormalMax": 27.0, + "PathologicMin": 37.0, + "Gene": "ATXN7" +}, +{ + "LocusId": "DM2_CNBP", + "ReferenceRegion": ["chr3:129172576-129172656", "chr3:129172656-129172696", "chr3:129172696-129172734"], + "LocusStructure": "(CAGG)*(CAGA)*(CA)*", + "VariantType": ["Repeat", "Repeat", "Repeat"], + "VariantId": ["DM2_CNBP", "DM2_CNBP_CAGA", "DM2_CNBP_CA"], + "PathologicRegion": "chr3:129172576-129172656", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAGG", + "Disease": "DM2", + "NormalMax": 26.0, + "PathologicMin": 75.0, + "Gene": "CNBP" +}, +{ + "LocusId": "BPES_FOXL2", + "ReferenceRegion": "chr3:138946019-138946062", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "NGC", + "Disease": "BPES", + "NormalMax": 14.0, + "PathologicMin": 15.0, + "Gene": "FOXL2" +}, +{ + "LocusId": "FAME4_YEATS2", + "ReferenceRegion": "chr3:183712187-183712226", + "LocusStructure": "(TTTCA)*", + 
"VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME4", + "NormalMax": 999.0, + "PathologicMin": 1000.0, + "Gene": "YEATS2" +}, +{ + "LocusId": "HD_HTT", + "ReferenceRegion": ["chr4:3074876-3074933", "chr4:3074939-3074975"], + "LocusStructure": "(CAG)*CAACAG(CCG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["HD_HTT", "HD_HTT_CCG"], + "PathologicRegion": "chr4:3074876-3074933", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "HD", + "NormalMax": 26.0, + "PathologicMin": 36.0, + "Gene": "HTT" +}, +{ + "LocusId": "CANVAS_RFC1", + "ReferenceRegion": "chr4:39348424-39348483", + "LocusStructure": "(AAGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "AAGGG", + "Disease": "CANVAS", + "NormalMax": 11.0, + "PathologicMin": 400.0, + "Gene": "RFC1" +}, +{ + "LocusId": "CCHS_PHOX2B", + "ReferenceRegion": "chr4:41745972-41746032", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "CCHS", + "NormalMax": 20.0, + "PathologicMin": 26.0, + "Gene": "PHOX2B" +}, +{ + "LocusId": "FAME7_RAPGEF2", + "ReferenceRegion": "chr4:159342526-159342618", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME7", + "NormalMax": 59.0, + "PathologicMin": 60.0, + "Gene": "RAPGEF2" +}, +{ + "LocusId": "FAME3_MARCHF6", + "ReferenceRegion": "chr5:10356343-10356411", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME3", + "NormalMax": 790.0, + "PathologicMin": 791.0, + "Gene": "MARCHF6" +}, +{ + "LocusId": "SCA12_PPP2R2B", + "ReferenceRegion": "chr5:146878727-146878759", + "LocusStructure": "(GCT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + 
"DisplayRU": "GCT", + "Disease": "SCA12", + "NormalMax": 32.0, + "PathologicMin": 51.0, + "Gene": "PPP2R2B" +}, +{ + "LocusId": "SCA1_ATXN1", + "ReferenceRegion": "chr6:16327633-16327724", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA1", + "NormalMax": 35.0, + "PathologicMin": 39.0, + "Gene": "ATXN1" +}, +{ + "LocusId": "CCD_RUNX2", + "ReferenceRegion": "chr6:45422750-45422801", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "CCD", + "NormalMax": 17.0, + "PathologicMin": 20.0, + "Gene": "RUNX2" +}, +{ + "LocusId": "SCA17_TBP", + "ReferenceRegion": "chr6:170561906-170562017", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCA", + "Disease": "SCA17", + "NormalMax": 40.0, + "PathologicMin": 49.0, + "Gene": "TBP" +}, +{ + "LocusId": "HFG_HOXA13-III", + "ReferenceRegion": "chr7:27199678-27199732", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-III", + "NormalMax": 18.0, + "PathologicMin": 22.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "HFG_HOXA13-II", + "ReferenceRegion": "chr7:27199825-27199861", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-II", + "NormalMax": 12.0, + "PathologicMin": 18.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "HFG_HOXA13-I", + "ReferenceRegion": "chr7:27199924-27199966", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "NGC", + "Disease": "HFG-I", + "NormalMax": 14.0, + "PathologicMin": 22.0, + "Gene": "HOXA13" +}, +{ + "LocusId": "FRA7A_ZNF713", + "ReferenceRegion": "chr7:55887600-55887639", + "LocusStructure": "(GCG)*", + 
"VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCG", + "Disease": "FRA7A", + "NormalMax": 22.0, + "PathologicMin": 450.0, + "Gene": "ZNF713" +}, +{ + "LocusId": "OPDM1_LRP12", + "ReferenceRegion": "chr8:104588970-104588999", + "LocusStructure": "(CGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CGC", + "Disease": "OPDM1", + "NormalMax": 45.0, + "PathologicMin": 85.0, + "Gene": "LRP12" +}, +{ + "LocusId": "FAME1_SAMD12", + "ReferenceRegion": "chr8:118366812-118366918", + "LocusStructure": "(TGAAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGAAA", + "Disease": "FAME1", + "NormalMax": 104.0, + "PathologicMin": 105.0, + "Gene": "SAMD12" +}, +{ + "LocusId": "FTDALS1_C9orf72", + "ReferenceRegion": "chr9:27573484-27573546", + "LocusStructure": "(GGCCCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGCCCC", + "Disease": "FTDALS1", + "NormalMax": 23.0, + "PathologicMin": 251.0, + "Gene": "C9orf72" +}, +{ + "LocusId": "FRDA_FXN", + "ReferenceRegion": ["chr9:69037270-69037286", "chr9:69037286-69037304"], + "LocusStructure": "(A)*(GAA)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["FRDA_FXN_A", "FRDA_FXN"], + "PathologicRegion": "chr9:69037286-69037304", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GAA", + "Disease": "FRDA", + "NormalMax": 33.0, + "PathologicMin": 56.0, + "Gene": "FXN" +}, +{ + "LocusId": "HSAN-VIII_PRDM12", + "ReferenceRegion": "chr9:130681605-130681641", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCC", + "Disease": "HSAN VIII", + "NormalMax": 14.0, + "PathologicMin": 18.0, + "Gene": "PRDM12" +}, +{ + "LocusId": "OPML1_NUTM2B-AS1", + "ReferenceRegion": "chr10:79826383-79826404", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + 
"InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "OPML1", + "NormalMax": 16.0, + "PathologicMin": 161.0, + "Gene": "NUTM2B-AS1" +}, +{ + "LocusId": "JBS_CBL", + "ReferenceRegion": "chr11:119206289-119206323", + "LocusStructure": "(CGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CGG", + "Disease": "JBS", + "NormalMax": 79.0, + "PathologicMin": 101.0, + "Gene": "CBL" +}, +{ + "LocusId": "DRPLA_ATN1", + "ReferenceRegion": "chr12:6936716-6936775", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "DRPLA", + "NormalMax": 35.0, + "PathologicMin": 48.0, + "Gene": "ATN1" +}, +{ + "LocusId": "FRA12A_DIP2B", + "ReferenceRegion": "chr12:50505001-50505024", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "FRA12A", + "NormalMax": 23.0, + "PathologicMin": 273.0, + "Gene": "DIP2B" +}, +{ + "LocusId": "SCA2_ATXN2", + "ReferenceRegion": "chr12:111598949-111599019", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "CTG", + "Disease": "SCA2", + "NormalMax": 30.0, + "PathologicMin": 35.0, + "Gene": "ATXN2" +}, +{ + "LocusId": "OPDM4_RILPL1", + "ReferenceRegion": "chr12:123533720-123533750", + "LocusStructure": "(GGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGC", + "Disease": "OPDM4", + "NormalMax": 16.0, + "PathologicMin": 120.0, + "Gene": "RILPL1" +}, +{ + "LocusId": "SCA8_ATXN8OS", + "ReferenceRegion": ["chr13:70139353-70139383", "chr13:70139383-70139429"], + "LocusStructure": "(CTA)*(CTG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA8_ATXN8OS_CTA", "SCA8_ATXN8OS"], + "PathologicRegion": "chr13:70139383-70139429", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": 
"SCA8", + "NormalMax": 50.0, + "PathologicMin": 71.0, + "Gene": "ATXN8OS" +}, +{ + "LocusId": "HPE5_ZIC2", + "ReferenceRegion": "chr13:99985448-99985494", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "HPE5", + "NormalMax": 15.0, + "PathologicMin": 25.0, + "Gene": "ZIC2" +}, +{ + "LocusId": "SCA27B_FGF14", + "ReferenceRegion": "chr13:102161574-102161726", + "LocusStructure": "(GAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GAA", + "Disease": "SCA27B", + "NormalMax": 179.0, + "PathologicMin": 320.0, + "Gene": "FGF14" +}, +{ + "LocusId": "OPMD_PABPN1", + "ReferenceRegion": "chr14:23321472-23321503", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD", "AR"], + "DisplayRU": "GCN", + "Disease": "OPMD", + "NormalMax": 10.0, + "PathologicMin": 12.0, + "Gene": "PABPN1" +}, +{ + "LocusId": "SCA3_ATXN3", + "ReferenceRegion": "chr14:92071010-92071052", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA3, MJD", + "NormalMax": 44.0, + "PathologicMin": 60.0, + "Gene": "ATXN3" +}, +{ + "LocusId": "ALS1_NIPA1", + "ReferenceRegion": "chr15:22786677-22786703", + "LocusStructure": "(GCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCG", + "Disease": "ALS1", + "NormalMax": 10.0, + "PathologicMin": 11.0, + "Gene": "NIPA1" +}, +{ + "LocusId": "MIR7-2_CHNG3", + "ReferenceRegion": "chr15:88569433-88569452", + "LocusStructure": "(TTTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTG", + "Disease": "CHNG3", + "NormalMax": 4.0, + "PathologicMin": 5.0, + "Gene": "MIR7-2" +}, +{ + "LocusId": "CPEO_POLG", + "ReferenceRegion": ["chr15:89333579-89333585", "chr15:89333588-89333629"], + "LocusStructure": 
"(GCT)*GTT(GCT)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["CPEO_POLG_GCT", "CPEO_POLG"], + "PathologicRegion": "chr15:89333588-89333629", + "HGNCId": null, + "InheritanceMode": [], + "DisplayRU": "GCT", + "Disease": "CPEO", + "NormalMax": 10.0, + "PathologicMin": 11.0, + "Gene": "POLG" +}, +{ + "LocusId": "DBQD2_XYLT1", + "ReferenceRegion": ["chr16:17470862-17470883", "chr16:17470907-17470922"], + "LocusStructure": "(GCC)*TCGGCTCGCCGCTGCTCCTCCTCC(GCC)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["DBQD2_XYLT1_GCC", "DBQD2_XYLT1"], + "PathologicRegion": "chr16:17470907-17470922", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GCC", + "Disease": "DBQD2, BSS", + "NormalMax": 20.0, + "PathologicMin": 72.0, + "Gene": "XYLT1" +}, +{ + "LocusId": "FAME6_TNRC6A", + "ReferenceRegion": "chr16:24613438-24613532", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME6", + "NormalMax": 1099.0, + "PathologicMin": 1100.0, + "Gene": "TNRC6A" +}, +{ + "LocusId": "SCA31_BEAN1", + "ReferenceRegion": "chr16:66490396-66490466", + "LocusStructure": "(TGGAA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGGAA", + "Disease": "SCA31", + "NormalMax": 109.0, + "PathologicMin": 110.0, + "Gene": "BEAN1" +}, +{ + "LocusId": "SCA_THAP11", + "ReferenceRegion": "chr16:67842862-67842950", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "SCA", + "NormalMax": 38.0, + "PathologicMin": 45.0, + "Gene": "THAP11" +}, +{ + "LocusId": "SCA4_ZFHX3", + "ReferenceRegion": "chr16:72787694-72787758", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCC", + "Disease": "SCA4", + "NormalMax": 26.0, + "PathologicMin": 46.0, + "Gene": "ZFHX3" +}, +{ + "LocusId": "HDL2_JPH3", 
+ "ReferenceRegion": "chr16:87604282-87604329", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "HDL2", + "NormalMax": 28.0, + "PathologicMin": 40.0, + "Gene": "JPH3" +}, +{ + "LocusId": "FAME8_RAI1", + "ReferenceRegion": "chr17:17808358-17808460", + "LocusStructure": "(TTTCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TTTCA", + "Disease": "FAME8", + "NormalMax": 8.0, + "PathologicMin": 9.0, + "Gene": "RAI1" +}, +{ + "LocusId": "RCPS_EIF4A3", + "ReferenceRegion": "chr17:80147009-80147139", + "LocusStructure": "(CCTCGCTGTGCCGCTGCCGA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "CCTCGCTGTGCCGCTGCCGA", + "Disease": "RCPS", + "NormalMax": 12.0, + "PathologicMin": 14.0, + "Gene": "EIF4A3" +}, +{ + "LocusId": "CPUM_TYMS", + "ReferenceRegion": "chr18:666891-667632", + "LocusStructure": "(GATGGT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "GATGGT", + "Disease": "CPUM", + "NormalMax": 172, + "PathologicMin": 210, + "Gene": "TYMS" +}, +{ + "LocusId": "FECD3_TCF4", + "ReferenceRegion": "chr18:55586153-55586229", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "FECD3", + "NormalMax": 39.0, + "PathologicMin": 51.0, + "Gene": "TCF4" +}, +{ + "LocusId": "MRUPAV_PLIN4", + "ReferenceRegion": "chr19:4510727-4513659", + "LocusStructure": "(TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "TGGTGTCCACGCCGGTCTGGATGGTTCCTTTGGCCACATTCATGGCACCAGTCACCCCACTACAGACGGTGTCCTTGGTACCTGTTAGGACAGTCTTAC", + "Disease": "MRUPAV", + "NormalMax": 31.0, + "PathologicMin": 37.0, + "Gene": "PLIN4" +}, +{ + "LocusId": "SCA6_CACNA1A", + 
"ReferenceRegion": "chr19:13207858-13207898", + "LocusStructure": "(CTG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CTG", + "Disease": "SCA6", + "NormalMax": 18.0, + "PathologicMin": 21.0, + "Gene": "CACNA1A" +}, +{ + "LocusId": "OPDM2_GIPC1", + "ReferenceRegion": "chr19:14496041-14496075", + "LocusStructure": "(CCG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CCG", + "Disease": "OPDM2", + "NormalMax": 32.0, + "PathologicMin": 73.0, + "Gene": "GIPC1" +}, +{ + "LocusId": "EDM1-PSACH_COMP", + "ReferenceRegion": "chr19:18786034-18786050", + "LocusStructure": "(GTC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GTC", + "Disease": "EDM1, PSACH", + "NormalMax": 5.0, + "PathologicMin": 6.0, + "Gene": "COMP" +}, +{ + "LocusId": "DM1_DMPK", + "ReferenceRegion": "chr19:45770204-45770266", + "LocusStructure": "(CAG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "CAG", + "Disease": "DM1", + "NormalMax": 34.0, + "PathologicMin": 50.0, + "Gene": "DMPK" +}, +{ + "LocusId": "SCA36_NOP56", + "ReferenceRegion": ["chr20:2652732-2652757", "chr20:2652757-2652775"], + "LocusStructure": "(GGCCTG)*(CGCCTG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["SCA36_NOP56", "SCA36_NOP56_CGCCTG"], + "PathologicRegion": "chr20:2652732-2652757", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GGCCTG", + "Disease": "SCA36", + "NormalMax": 14.0, + "PathologicMin": 650.0, + "Gene": "NOP56" +}, +{ + "LocusId": "CJD_PRNP", + "ReferenceRegion": ["chr20:4699370-4699397", "chr20:4699397-4699493"], + "LocusStructure": "(CCTCAGGGCGGTGGTGGCTGGGGGCAG)*(CCTCATGGTGGTGGCTGGGGGCAG)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["CJD_PRNP_CCTCAGGGCGGTGGTGGCTGGGGGCAG", "CJD_PRNP"], + "PathologicRegion": "chr20:4699397-4699493", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": 
"CCTCATGGTGGTGGCTGGGGGCAG", + "Disease": "CJD", + "NormalMax": 4.0, + "PathologicMin": 5.0, + "Gene": "PRNP" +}, +{ + "LocusId": "EPM1_CSTB", + "ReferenceRegion": "chr21:43776442-43776479", + "LocusStructure": "(CGCGGGGCGGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AR"], + "DisplayRU": "CGCGGGGCGGGG", + "Disease": "EPM1", + "NormalMax": 3.0, + "PathologicMin": 30.0, + "Gene": "CSTB" +}, +{ + "LocusId": "TOF_TBX1", + "ReferenceRegion": "chr22:19766762-19766807", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "GCN", + "Disease": "TOF", + "NormalMax": 15.0, + "PathologicMin": 25.0, + "Gene": "TBX1" +}, +{ + "LocusId": "SCA10_ATXN10", + "ReferenceRegion": "chr22:45795354-45795424", + "LocusStructure": "(ATTCT)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["AD"], + "DisplayRU": "ATTCT", + "Disease": "SCA10", + "NormalMax": 32.0, + "PathologicMin": 800.0, + "Gene": "ATXN10" +}, +{ + "LocusId": "PRTS_ARX", + "ReferenceRegion": "chrX:25013529-25013565", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "PRTS", + "NormalMax": 12.0, + "PathologicMin": 20.0, + "Gene": "ARX" +}, +{ + "LocusId": "EIEE1_ARX", + "ReferenceRegion": "chrX:25013649-25013697", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "EIEE1", + "NormalMax": 16.0, + "PathologicMin": 17.0, + "Gene": "ARX" +}, +{ + "LocusId": "DMD_DMD", + "ReferenceRegion": ["chrX:31284557-31284605", "chrX:31284605-31284613"], + "LocusStructure": "(TTC)*(T)*", + "VariantType": ["Repeat", "Repeat"], + "VariantId": ["DMD_DMD", "DMD_DMD_T"], + "PathologicRegion": "chrX:31284557-31284605", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "TTC", + "Disease": "DMD", + "NormalMax": 33.0, + "PathologicMin": 59.0, + "Gene": 
"DMD" +}, +{ + "LocusId": "SBMA_AR", + "ReferenceRegion": "chrX:67545316-67545419", + "LocusStructure": "(GCA)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCA", + "Disease": "SBMA", + "NormalMax": 34.0, + "PathologicMin": 38.0, + "Gene": "AR" +}, +{ + "LocusId": "XDP_TAF1", + "ReferenceRegion": "chrX:71453054-71453131", + "LocusStructure": "(AGAGGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "AGAGGG", + "Disease": "XDP", + "NormalMax": 34.0, + "PathologicMin": 35.0, + "Gene": "TAF1" +}, +{ + "LocusId": "VACTERLX_ZIC3", + "ReferenceRegion": "chrX:137566826-137566856", + "LocusStructure": "(GCN)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCN", + "Disease": "VACTERLX", + "NormalMax": 10.0, + "PathologicMin": 12.0, + "Gene": "ZIC3" +}, +{ + "LocusId": "XLMR_SOX3", + "ReferenceRegion": "chrX:140504316-140504361", + "LocusStructure": "(NGC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "NGC", + "Disease": "XLMR", + "NormalMax": 15.0, + "PathologicMin": 22.0, + "Gene": "SOX3" +}, +{ + "LocusId": "FXS_FMR1", + "ReferenceRegion": "chrX:147912049-147912111", + "LocusStructure": "(CGG)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XD"], + "DisplayRU": "CGG", + "Disease": "FXS, FXTAS, POF1", + "NormalMax": 44.0, + "PathologicMin": 201.0, + "Gene": "FMR1" +}, +{ + "LocusId": "FRAXE_AFF2", + "ReferenceRegion": "chrX:148500604-148500753", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "HGNCId": null, + "InheritanceMode": ["XR"], + "DisplayRU": "GCC", + "Disease": "FRAXE", + "NormalMax": 39.0, + "PathologicMin": 201.0, + "Gene": "AFF2" +}] diff --git a/data/ref-alleles/ref-alleles.T2T-chm13.txt b/data/ref-alleles/ref-alleles.T2T-chm13.txt index e391eb3a..add1844d 100644 --- a/data/ref-alleles/ref-alleles.T2T-chm13.txt +++ 
b/data/ref-alleles/ref-alleles.T2T-chm13.txt @@ -60,15 +60,15 @@ ATCCTAGCGC GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA CCCGCATCCG SCA7_ATXN7 chr3 63956302 63956333 CAG STRchive -chr3 63956302 63956345 CAG,CCG TRGT -CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTCCGCAGCCCCAGC -CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCG CCG CCT CCG CAGCCCCAGC +chr3 63956302 63956333 CAG,CCG TRGT +CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTC +CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTC DM2_CNBP chr3 131917482 131917557 CAGG STRchive -chr3 131917482 131917635 CAGG,CAGA,CA TRGT -GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGC CAGG CAGG CAGG CAGG CAGG CAGG CAG ACAGACAGACAGACAGACAGACAGACACACACACACACACACACACACACACACACACACACACACACTGGCAGTAATACTCATTCAC -GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CA GC CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGA CAGA CAGA CAGA CAGA CAGA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CTGG CA GTAAT ACTCATTCAC +chr3 131917482 131917557 CAGG,CAGA,CA TRGT +GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGC CAGG CAGG CAGG CAGG CAGG CAGG CAG ACAGACAGAC +GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CA GC CAGG CAGG CAGG CAGG CAGG CAGG CA G ACAGACAGAC BPES_FOXL2 chr3 141687011 141687054 NGC STRchive @@ -78,15 +78,15 @@ CTACCGGGGC C CGC GGC TGC AGC CGC AGC TGC TGC AGC CGC TGC GGC TGC CGC CATCTGGCAG FAME4_YEATS2 chr3 186521667 186521706 TTTTA,TTTCA STRchive -chr3 186521667 186521706 TTTTA,TTTCA TRGT +chr3 186521667 186521706 TTTCA,TTTTA TRGT TTTTATGTTC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT TTGAGACAGA TTTTATGTTC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT TTGAGACAGA HD_HTT chr4 3073603 3073687 CAG STRchive -chr4 3073603 3073723 CAG,CCG TRGT -CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG 
CAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTC -CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAA CAG CCG CCA CCG CCG CCG CCG CCG CCG CCG CCT CCTCAGCTTC +chr4 3073603 3073687 CAG,CCG TRGT +CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGC +CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGC CANVAS_RFC1 chr4 39318077 39318136 AAAAG,AAGGG,ACAGG,AGGGC,AAGGC,AGAGG STRchive @@ -102,13 +102,13 @@ CAGGCCTCCA GCT GCC GCC GCT GCC GCT GCC GCC GCC GCC GCT GCC GCG GCC GCC GCC GCT G FAME7_RAPGEF2 chr4 162693303 162693405 TTTTA,TTTCA STRchive -chr4 162693303 162693405 TTTTA,TTTCA TRGT +chr4 162693303 162693405 TTTCA,TTTTA TRGT CTATCATAGC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TATTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TT ACTAGAGGAT CTATCATAGC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TATTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TT ACTAGAGGAT FAME3_MARCHF6 chr5 10295525 10295593 TTTTA,TTTCA STRchive -chr5 10295525 10295593 TTTTA,TTTCA TRGT +chr5 10295525 10295593 TTTCA,TTTTA TRGT CTGTTTTTTA TTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT CCGAGATGGA CTGTTTTTTA TTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT CCGAGATGGA @@ -168,7 +168,7 @@ AGGTAGACGA CGC CGC CGC CGC CGC CGC CGC CGC CGC CGC CG AGCCACCGGC FAME1_SAMD12 chr8 119495247 119495353 TAAAA,TGAAA STRchive -chr8 119495247 119495353 TAAAA,TGAAA TRGT +chr8 119495247 119495353 TGAAA,TAAAA TRGT ACTCTGTCTC AAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA ATGAACAAAA ACTCTGTCTC AAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAA TAAAA TAAAA TAAAA 
TAAAA TAAAA TAAAA TAAAA ATGAACAAAA @@ -180,9 +180,9 @@ CGCAACCGCA GCCCCGCCCCGGGCCCGCCCCCGGGCCCGCCCCGACCACGCCCC GGCCCC GGCCCC GGCCCC GGC FRDA_FXN chr9 81210834 81210861 GAA STRchive -chr9 81210818 81210861 A,GAA TRGT -AAAAATACAAAAAAAAAAAAAAAAAA GAA GAA GAA GAA GAA GAA GAA GAA GAA AATAAAGAAA -AAAAATACAA A A A A A A A A A A A A A A A A GAA GAA GAA GAA GAA GAA GAA GAA GAA AATAAAGAAA +chr9 81210834 81210861 A,GAA TRGT +AAAAAAAAAA GAA GAA GAA GAA GAA GAA GAA GAA GAA AATAAAGAAA +AAAAAAAAAA GAA GAA GAA GAA GAA GAA GAA GAA GAA AATAAAGAAA HSAN-VIII_PRDM12 chr9 142886568 142886595 GCC STRchive @@ -228,9 +228,9 @@ CTCCCGAGTG GGC GGC GGC GGC GGC GGC GGC GGC GGC GGC AGCGGGGAGG SCA8_ATXN8OS chr13 69361243 69361270 CTG STRchive -chr13 69361213 69361270 CTA,CTG TRGT -AAACCTGGCTTTACTACTACTACTACTACTACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CATTTTTTAA -AAACCTGGCT TTA CTA CTA CTA CTA CTA CTA CTA CTA CTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CATTTTTTAA +chr13 69361243 69361270 CTA,CTG TRGT +ACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CATTTTTTAA +ACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CATTTTTTAA HPE5_ZIC2 chr13 99196358 99196404 GCN STRchive @@ -282,7 +282,7 @@ TCCCGCTCGG GCC GCC GCC GCC GCC GCC GCC TCGGCTC GCC GCTGCTCCTCCTCC GCC GCC GCC GC FAME6_TNRC6A chr16 24890366 24890430 TTTTA,TTTCA STRchive -chr16 24890366 24890430 TTTTA,TTTCA TRGT +chr16 24890366 24890430 TTTCA,TTTTA TRGT CTATTAAAGC A TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT ACTTATTTAT CTATTAAAGC A TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT ACTTATTTAT @@ -312,13 +312,13 @@ AAGCCAGGGA G CTG C CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C FAME8_RAI1 chr17 17754961 17755053 TTTTA,TTTCA STRchive -chr17 17754961 17755053 ATTTT,TTTCA,TTTTA TRGT +chr17 17754961 17755053 TTTCA,TTTTA TRGT +TTATTTTTAA A TTTTA TTTTA TTTTA TA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT CATCTCAGAC TTATTTTTAA A TTTTA 
TTTTA TTTTA TA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT CATCTCAGAC -TTATTTTTAA ATTTT ATTTT ATTTT AT ATTTT ATTTT ATT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATT ATTTT ATTTT ATTTT ATTTT ATTTT ATTT CATCTCAGAC RCPS_EIF4A3 chr17 81047404 81047534 CCTCGCTGTGCCGCTGCCGA STRchive -chr17 81047404 81047534 CCTCGCTGCGCCGCTGCCGA,CCTCGCTGTGCCGCTGCCGA TRGT +chr17 81047404 81047534 CCTCGCTGTGCCGCTGCCGA TRGT CCGACCTCGC TGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA GAACAGACGC CCGACCTCGC TGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA GAACAGACGC @@ -366,15 +366,15 @@ GTGATCCCCC CAG CAG CAG CAG CAG CA TTCCCGGCTA SCA36_NOP56 chr20 2683189 2683230 GGCCTG STRchive -chr20 2683189 2683248 GGCCTG,CGCCTG TRGT -CGCAGACAGA GCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCCTGCCCTGGGAACGGGTTC -CGCAGACAGA GCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG CGCCTG CGCCTG CCCTGG GAACGGGTTC +chr20 2683189 2683230 GGCCTG,CGCCTG TRGT +CGCAGACAGA GCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCC +CGCAGACAGA GCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCC CJD_PRNP chr20 4738633 4738705 GGTGGTGGCTGGGGGCAGCCTCAT,CCTCATGGTGGTGGCTGGGGGCAG STRchive -chr20 4738606 4738705 CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT TRGT -CCGCTACCCACCTCAGGGCGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA -CCGCTACCCA CCTCAGGGCGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA +chr20 4738633 4738705 CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT TRGT +CTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA +CTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG 
CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA EPM1_CSTB chr21 42132054 42132091 CGCGGGGCGGGG STRchive @@ -408,9 +408,9 @@ CCGTGGCCGT GGC GGC CGC TGC CGC CGC CGC CGC CGC CGC CGC CGC CGC CGC TGC CGC ACCC DMD_DMD chrX 30882677 30882743 TTC STRchive -chrX 30882677 30882751 TTC,T TRGT -AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGGCAGAGGTG -AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC T T T T T T T T GGCAGAGGTG +chrX 30882677 30882743 TTC,T TRGT +AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGG +AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGG SBMA_AR chrX 65975147 65975250 GCA STRchive diff --git a/data/ref-alleles/ref-alleles.hg19.txt b/data/ref-alleles/ref-alleles.hg19.txt index 307da839..cf691537 100644 --- a/data/ref-alleles/ref-alleles.hg19.txt +++ b/data/ref-alleles/ref-alleles.hg19.txt @@ -60,15 +60,15 @@ ATCCTAGCGC GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA CCCG SCA7_ATXN7 chr3 63898360 63898391 CAG STRchive -chr3 63898360 63898403 CAG,CCG TRGT -CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTCCGCAGCCCCAGC -CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCG CCG CCT CCG CAGCCCCAGC +chr3 63898360 63898391 CAG,CCG TRGT +CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTC +CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTC DM2_CNBP chr3 128891419 128891499 CAGG STRchive -chr3 128891419 128891577 CAGG,CAGA,CA TRGT -GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CAGC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGACAGACAGACAGACAGACAGACAGACAGACAGACAGACACACACACACACACACACACACACACACACACACACTGGCAGTAATA -GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CA GC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGA CAGA CAGA CAGA CAGA 
CAGA CAGA CAGA CAGA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CT GGCAGTAATA +chr3 128891419 128891499 CAGG,CAGA,CA TRGT +GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CAGC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGACAGACA +GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CA GC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGACAGACA BPES_FOXL2 chr3 138664861 138664904 NGC STRchive @@ -78,15 +78,15 @@ CTACCGGGGC C CGC GGC TGC AGC CGC AGC TGC TGC AGC CGC TGC GGC TGC CGC CATCTGGCAG FAME4_YEATS2 chr3 183429975 183430014 TTTTA,TTTCA STRchive -chr3 183429975 183430014 TTTTA,TTTCA TRGT +chr3 183429975 183430014 TTTCA,TTTTA TRGT TTTTATGTTC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT TTGAGACAGA TTTTATGTTC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT TTGAGACAGA HD_HTT chr4 3076603 3076660 CAG STRchive -chr4 3076603 3076696 CAG,CCG TRGT -CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTC -CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAA CAG CCG CCA CCG CCG CCG CCG CCG CCG CCG CCT CCTCAGCTTC +chr4 3076603 3076660 CAG,CCG TRGT +CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGC +CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGC CANVAS_RFC1 chr4 39350044 39350103 AAAAG,AAGGG,ACAGG,AGGGC,AAGGC,AGAGG STRchive @@ -102,13 +102,13 @@ CAGGCCTCCA GCT GCC GCC GCT GCC GCT GCC GCC GCC GCC GCT GCC GCG GCC GCC GCC GCT G FAME7_RAPGEF2 chr4 160263678 160263770 TTTTA,TTTCA STRchive -chr4 160263678 160263770 TTTTA,TTTCA TRGT +chr4 160263678 160263770 TTTCA,TTTTA TRGT CTATCATAGC TTTTA TTTTA TTTTA TTTTA TTTTA TATTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TT ACTAGAGGAT CTATCATAGC TTTTA TTTTA TTTTA TTTTA TTTTA TATTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TT ACTAGAGGAT 
FAME3_MARCHF6 chr5 10356455 10356523 TTTTA,TTTCA STRchive -chr5 10356455 10356523 TTTTA,TTTCA TRGT +chr5 10356455 10356523 TTTCA,TTTTA TRGT CTGTTTTTTA TTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT CCGAGATGGA CTGTTTTTTA TTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT CCGAGATGGA @@ -168,7 +168,7 @@ ACGACGCCGA CGC CGC CGC CGC CGC CGC CGC CGC CGC CG AGCCACCGGC FAME1_SAMD12 chr8 119379051 119379157 TAAAA,TGAAA STRchive -chr8 119379051 119379157 TAAAA,TGAAA TRGT +chr8 119379051 119379157 TGAAA,TAAAA TRGT ACTCTGTCTC AAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA ATGAACAAAA ACTCTGTCTC AAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA ATGAACAAAA @@ -180,9 +180,9 @@ CGCAACCGCA GCCCCGCCCCGGGCCCGCCCCCGGGCCCGCCCCGACCACGCCCC GGCCCC GGCCCC GGCCCC TA FRDA_FXN chr9 71652202 71652220 GAA STRchive -chr9 71652186 71652220 A,GAA TRGT -TAAAAAATACAAAAAAAAAAAAAAAA GAA GAA GAA GAA GAA GAA AATAAAGAAA -TAAAAAATAC A A A A A A A A A A A A A A A A GAA GAA GAA GAA GAA GAA AATAAAGAAA +chr9 71652202 71652220 A,GAA TRGT +AAAAAAAAAA GAA GAA GAA GAA GAA GAA AATAAAGAAA +AAAAAAAAAA GAA GAA GAA GAA GAA GAA AATAAAGAAA HSAN-VIII_PRDM12 chr9 133556992 133557028 GCC STRchive @@ -228,9 +228,9 @@ CTCCCGAGTG GGC GGC GGC GGC GGC GGC GGC GGC GGC GGC AGCGGGGAGG SCA8_ATXN8OS chr13 70713515 70713561 CTG STRchive -chr13 70713485 70713561 CTA,CTG TRGT -CCTGGCTTTACTACTACTACTACTACTACTACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA -CCTGGCTTTA CTA CTA CTA CTA CTA CTA CTA CTA CTA CTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA +chr13 70713515 70713561 CTA,CTG TRGT +ACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA +ACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA 
HPE5_ZIC2 chr13 100637702 100637748 GCN STRchive @@ -282,7 +282,7 @@ TCCCGCTCGG GCC GCC GCC GCC GCC CCCCTCCCCA FAME6_TNRC6A chr16 24624759 24624853 TTTTA,TTTCA STRchive -chr16 24624759 24624853 TTTTA,TTTCA TRGT +chr16 24624759 24624853 TTTCA,TTTTA TRGT CTATTAAAGC A TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT ACTTATTTAT CTATTAAAGC A TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT ACTTATTTAT @@ -312,13 +312,13 @@ AAGCCAGGGA G CTG C CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG TAAG FAME8_RAI1 chr17 17711672 17711774 TTTTA,TTTCA STRchive -chr17 17711672 17711774 ATTTT,TTTCA,TTTTA TRGT +chr17 17711672 17711774 TTTCA,TTTTA TRGT +TTATTTTTAA A TTTTA TTTTA TTTTA TA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT CATCTCAGAC TTATTTTTAA A TTTTA TTTTA TTTTA TA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT CATCTCAGAC -TTATTTTTAA ATTTT ATTTT ATTTT AT ATTTT ATTTT ATT ATTTT ATTTT ATTTT ATTTT ATTTT ATT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATTT CATCTCAGAC RCPS_EIF4A3 chr17 78120808 78120938 CCTCGCTGTGCCGCTGCCGA STRchive -chr17 78120808 78120938 CCTCGCTGCGCCGCTGCCGA,CCTCGCTGTGCCGCTGCCGA TRGT +chr17 78120808 78120938 CCTCGCTGTGCCGCTGCCGA TRGT CCGACCTCGC TGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA GAACAGACGC CCGACCTCGC TGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA GAACAGACGC @@ -366,15 +366,15 @@ GTGATCCCCC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG C SCA36_NOP56 chr20 2633378 2633403 GGCCTG STRchive -chr20 2633378 2633421 GGCCTG,CGCCTG TRGT -GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCCTGCGCCTGCCCTGGGAAC -GCCGCAGACA 
G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTG CGCCTG CGCCTG CCCTGGGAAC +chr20 2633378 2633403 GGCCTG,CGCCTG TRGT +GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCC +GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCC CJD_PRNP chr20 4680043 4680139 GGTGGTGGCTGGGGGCAGCCTCAT,CCTCATGGTGGTGGCTGGGGGCAG STRchive -chr20 4680016 4680139 CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT TRGT -CCGCTACCCACCTCAGGGCGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA -CCGCTACCCA CCTCAGGGCGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA +chr20 4680043 4680139 CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT TRGT +CTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA +CTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA EPM1_CSTB chr21 45196323 45196360 CGCGGGGCGGGG STRchive @@ -408,9 +408,9 @@ CCGTGGCCGT GGC GGC CGC TGC CGC CGC CGC CGC CGC CGC CGC CGC CGC CGC TGC CGC ACCC DMD_DMD chrX 31302674 31302722 TTC STRchive -chrX 31302674 31302730 TTC,T TRGT -AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGGCAGAGGTG -AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC T T T T T T T T GGCAGAGGTG +chrX 31302674 31302722 TTC,T TRGT +AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGG +AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGG SBMA_AR chrX 66765158 66765261 GCA STRchive diff --git a/data/ref-alleles/ref-alleles.hg38.txt b/data/ref-alleles/ref-alleles.hg38.txt index e54493d2..41a05c4f 100644 --- a/data/ref-alleles/ref-alleles.hg38.txt +++ b/data/ref-alleles/ref-alleles.hg38.txt @@ -60,15 +60,15 @@ ATCCTAGCGC GCA GCA GCA GCA 
GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA GCA CCCG SCA7_ATXN7 chr3 63912684 63912715 CAG STRchive -chr3 63912684 63912727 CAG,CCG TRGT -CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTCCGCAGCCCCAGC -CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCG CCG CCT CCG CAGCCCCAGC +chr3 63912684 63912715 CAG,CCG TRGT +CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTC +CGGCCGCCCG G CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CCGCCGCCTC DM2_CNBP chr3 129172576 129172656 CAGG STRchive -chr3 129172576 129172734 CAGG,CAGA,CA TRGT -GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CAGC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGACAGACAGACAGACAGACAGACAGACAGACAGACAGACACACACACACACACACACACACACACACACACACACTGGCAGTAATA -GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CA GC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGA CAGA CAGA CAGA CAGA CAGA CAGA CAGA CAGA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CT GGCAGTAATA +chr3 129172576 129172656 CAGG,CAGA,CA TRGT +GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CAGC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGACAGACA +GTGAGACAGA CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGA CAGG CAGA CAGG CA GC CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGG CAGACAGACA BPES_FOXL2 chr3 138946019 138946062 NGC STRchive @@ -78,15 +78,15 @@ CTACCGGGGC C CGC GGC TGC AGC CGC AGC TGC TGC AGC CGC TGC GGC TGC CGC CATCTGGCAG FAME4_YEATS2 chr3 183712187 183712226 TTTTA,TTTCA STRchive -chr3 183712187 183712226 TTTTA,TTTCA TRGT +chr3 183712187 183712226 TTTCA,TTTTA TRGT TTTTATGTTC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT TTGAGACAGA TTTTATGTTC TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT TTGAGACAGA HD_HTT chr4 3074876 3074933 CAG STRchive -chr4 3074876 3074969 CAG,CCG TRGT -CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGCCACCGCCGCCGCCGCCGCCGCCGCCTCCTCAGCTTC -CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG 
CAG CAG CAA CAG CCG CCA CCG CCG CCG CCG CCG CCG CCG CCT CCTCAGCTTC +chr4 3074876 3074933 CAG,CCG TRGT +CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGC +CAAGTCCTTC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAACAGCCGC CANVAS_RFC1 chr4 39348424 39348483 AAAAG,AAGGG,ACAGG,AGGGC,AAGGC,AGAGG STRchive @@ -102,13 +102,13 @@ CAGGCCTCCA GCT GCC GCC GCT GCC GCT GCC GCC GCC GCC GCT GCC GCG GCC GCC GCC GCT G FAME7_RAPGEF2 chr4 159342526 159342618 TTTTA,TTTCA STRchive -chr4 159342526 159342618 TTTTA,TTTCA TRGT +chr4 159342526 159342618 TTTCA,TTTTA TRGT CTATCATAGC TTTTA TTTTA TTTTA TTTTA TTTTA TATTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TT ACTAGAGGAT CTATCATAGC TTTTA TTTTA TTTTA TTTTA TTTTA TATTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TT ACTAGAGGAT FAME3_MARCHF6 chr5 10356343 10356411 TTTTA,TTTCA STRchive -chr5 10356343 10356411 TTTTA,TTTCA TRGT +chr5 10356343 10356411 TTTCA,TTTTA TRGT CTGTTTTTTA TTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT CCGAGATGGA CTGTTTTTTA TTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTT CCGAGATGGA @@ -168,7 +168,7 @@ ACGACGCCGA CGC CGC CGC CGC CGC CGC CGC CGC CGC CG AGCCACCGGC FAME1_SAMD12 chr8 118366812 118366918 TAAAA,TGAAA STRchive -chr8 118366812 118366918 TAAAA,TGAAA TRGT +chr8 118366812 118366918 TGAAA,TAAAA TRGT ACTCTGTCTC AAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA ATGAACAAAA ACTCTGTCTC AAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA TAAAA ATGAACAAAA @@ -180,9 +180,9 @@ CGCAACCGCA GCCCCGCCCCGGGCCCGCCCCCGGGCCCGCCCCGACCACGCCCC GGCCCC GGCCCC GGCCCC TA FRDA_FXN chr9 69037286 69037304 GAA STRchive -chr9 69037270 69037304 A,GAA TRGT -TAAAAAATACAAAAAAAAAAAAAAAA GAA GAA GAA GAA GAA 
GAA AATAAAGAAA -TAAAAAATAC A A A A A A A A A A A A A A A A GAA GAA GAA GAA GAA GAA AATAAAGAAA +chr9 69037286 69037304 A,GAA TRGT +AAAAAAAAAA GAA GAA GAA GAA GAA GAA AATAAAGAAA +AAAAAAAAAA GAA GAA GAA GAA GAA GAA AATAAAGAAA HSAN-VIII_PRDM12 chr9 130681605 130681641 GCC STRchive @@ -228,9 +228,9 @@ CTCCCGAGTG GGC GGC GGC GGC GGC GGC GGC GGC GGC GGC AGCGGGGAGG SCA8_ATXN8OS chr13 70139383 70139429 CTG STRchive -chr13 70139353 70139429 CTA,CTG TRGT -CCTGGCTTTACTACTACTACTACTACTACTACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA -CCTGGCTTTA CTA CTA CTA CTA CTA CTA CTA CTA CTA CTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA +chr13 70139383 70139429 CTA,CTG TRGT +ACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA +ACTACTACTA CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG C ATTTTTTAAA HPE5_ZIC2 chr13 99985448 99985494 GCN STRchive @@ -282,7 +282,7 @@ TCCCGCTCGG GCC GCC GCC GCC GCC CCCCTCCCCA FAME6_TNRC6A chr16 24613438 24613532 TTTTA,TTTCA STRchive -chr16 24613438 24613532 TTTTA,TTTCA TRGT +chr16 24613438 24613532 TTTCA,TTTTA TRGT CTATTAAAGC A TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT ACTTATTTAT CTATTAAAGC A TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT ACTTATTTAT @@ -312,13 +312,13 @@ AAGCCAGGGA G CTG C CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG CTG TAAG FAME8_RAI1 chr17 17808358 17808460 TTTTA,TTTCA STRchive -chr17 17808358 17808460 ATTTT,TTTCA,TTTTA TRGT +chr17 17808358 17808460 TTTCA,TTTTA TRGT +TTATTTTTAA A TTTTA TTTTA TTTTA TA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT CATCTCAGAC TTATTTTTAA A TTTTA TTTTA TTTTA TA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTTTA TTT CATCTCAGAC -TTATTTTTAA ATTTT ATTTT ATTTT AT 
ATTTT ATTTT ATT ATTTT ATTTT ATTTT ATTTT ATTTT ATT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATTTT ATTT CATCTCAGAC RCPS_EIF4A3 chr17 80147009 80147139 CCTCGCTGTGCCGCTGCCGA STRchive -chr17 80147009 80147139 CCTCGCTGCGCCGCTGCCGA,CCTCGCTGTGCCGCTGCCGA TRGT +chr17 80147009 80147139 CCTCGCTGTGCCGCTGCCGA TRGT CCGACCTCGC TGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA GAACAGACGC CCGACCTCGC TGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA CCTCGCTGTGCCGCTGCCGA GAACAGACGC @@ -366,15 +366,15 @@ GTGATCCCCC CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG CAG C SCA36_NOP56 chr20 2652732 2652757 GGCCTG STRchive -chr20 2652732 2652775 GGCCTG,CGCCTG TRGT -GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCCTGCGCCTGCCCTGGGAAC -GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTG CGCCTG CGCCTG CCCTGGGAAC +chr20 2652732 2652757 GGCCTG,CGCCTG TRGT +GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCC +GCCGCAGACA G GGCCTG GGCCTG GGCCTG GGCCTG CGCCTGCGCC CJD_PRNP chr20 4699397 4699493 GGTGGTGGCTGGGGGCAGCCTCAT,CCTCATGGTGGTGGCTGGGGGCAG STRchive -chr20 4699370 4699493 CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT TRGT -CCGCTACCCACCTCAGGGCGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA -CCGCTACCCA CCTCAGGGCGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA +chr20 4699397 4699493 CCTCAGGGCGGTGGTGGCTGGGGGCAG,CCTCATGGTGGTGGCTGGGGGCAG,GGTGGTGGCTGGGGGCAGCCTCAT TRGT +CTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA +CTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCTCATGGTGGTGGCTGGGGGCAG CCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA GGAGGTGGCA EPM1_CSTB chr21 43776442 43776479 
CGCGGGGCGGGG STRchive @@ -408,9 +408,9 @@ CCGTGGCCGT GGC GGC CGC TGC CGC CGC CGC CGC CGC CGC CGC CGC CGC CGC TGC CGC ACCC DMD_DMD chrX 31284557 31284605 TTC STRchive -chrX 31284557 31284613 TTC,T TRGT -AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGGCAGAGGTG -AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC T T T T T T T T GGCAGAGGTG +chrX 31284557 31284605 TTC,T TRGT +AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGG +AACGAACTGT TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTC TTTTTTTTGG SBMA_AR chrX 67545316 67545419 GCA STRchive diff --git a/scripts/environment.yml b/scripts/environment.yml index 5b537bbc..3acd9795 100644 --- a/scripts/environment.yml +++ b/scripts/environment.yml @@ -25,6 +25,8 @@ dependencies: - r-purrr - gfortran - pysam + - htslib + - bedtools - nodejs # build website locally - pyliftover - pip diff --git a/scripts/make-catalog.py b/scripts/make-catalog.py index ad3b3a5f..e25cd89b 100644 --- a/scripts/make-catalog.py +++ b/scripts/make-catalog.py @@ -2,6 +2,7 @@ import sys import re import decimal +import jsbeautifier def chrom_to_int(chrom): """ @@ -57,6 +58,113 @@ def clean_loci(data, genome): return keep_rows +def add_flank_coordinates(row, genome = 'hg38'): + """ + Get the start and stop coordinates of the flanking motif(s) in the locus structure. Only one of the motifs can be missing a count, the rest must have a count. 
+ + example input + + "locus_structure": [ + { + "motif": "CAG", + "count": null, + "type": "main_repeat" + }, + { + "motif": "CAACAG", + "count": 1, + "type": "interruption" + }, + { + "motif": "CCG", + "count": 12, + "type": "flank_repeat" + } + ] + + example output + "locus_structure": [ + { + "motif": "CAG", + "count": null, + "type": "main_repeat" + "position": "chr1:100-200" + }, + { + "motif": "CAACAG", + "count": 1, + "type": "interruption" + "position": "chr1:200-206" + }, + { + "motif": "CCG", + "count": 12, + "type": "flank_repeat" + "position": "chr1:206-242" + } + ] + + :param row: dictionary with STR data for a single locus + :param genome: genome build (hg19, hg38 or T2T) + :return: updated locus_structure with positions for each motif + + >>> add_flank_coordinates({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CAACAG', 'count': 1, 'type': 'interruption'}, {'motif': 'CCG', 'count': 12, 'type': 'flank_repeat'}]}, genome='hg38') + [{'motif': 'CAG', 'count': None, 'type': 'main_repeat', 'start_hg38': 100, 'stop_hg38': 200}, {'motif': 'CAACAG', 'count': 1, 'type': 'interruption', 'start_hg38': 200, 'stop_hg38': 206}, {'motif': 'CCG', 'count': 12, 'type': 'flank_repeat', 'start_hg38': 206, 'stop_hg38': 242}] + + >>> add_flank_coordinates({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'locus_structure': [{'motif': 'CAG', 'count': 10, 'type': 'flank_repeat'}, {'motif': 'CAACAG', 'count': 1, 'type': 'interruption'}, {'motif': 'CCG', 'count': None, 'type': 'main_repeat'}]}, genome='hg38') + [{'motif': 'CAG', 'count': 10, 'type': 'flank_repeat', 'start_hg38': 64, 'stop_hg38': 94}, {'motif': 'CAACAG', 'count': 1, 'type': 'interruption', 'start_hg38': 94, 'stop_hg38': 100}, {'motif': 'CCG', 'count': None, 'type': 'main_repeat', 'start_hg38': 100, 'stop_hg38': 200}] + + >>> add_flank_coordinates({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 
'locus_structure': []}, genome='hg38') + [] + """ + if len(row['locus_structure']) == 0: + # No locus structure, return empty list + return [] + + total_length = row['stop_' + genome] - row['start_' + genome] + known_lengths = [] + left_flank = 0 + right_flank = 0 + main_repeat_found = False + for struct_dict in row['locus_structure']: + if struct_dict['type'] == 'main_repeat' and struct_dict['count'] is None: + known_lengths.append(None) + main_repeat_found = True + elif struct_dict['count'] is not None: + known_lengths.append(len(struct_dict['motif']) * struct_dict['count']) + # This assumes than anything without a defined count is in the flanking region. + if main_repeat_found: + # Add to right flank + right_flank += len(struct_dict['motif']) * struct_dict['count'] + else: + # Add to left flank + left_flank += len(struct_dict['motif']) * struct_dict['count'] + else: + # Error case: if a motif is not a main repeat and has no count, it should not be in the locus structure + raise ValueError(f"Motif {struct_dict['motif']} has no count defined. Please check the input data.") + + # Check if more than one unknown length is present + if known_lengths.count(None) > 1: + raise ValueError(f"Multiple unknown lengths found in locus structure for {row['id']}. Please check the input data.") + unknown_length = total_length + left_flank + right_flank - sum([x for x in known_lengths if x is not None]) + #print(f"Total length of locus structure for {row['id']} is {total_length}, known lengths are {known_lengths}, flank lengths are {flank_length}, unknown length is {unknown_length}.") + if unknown_length < 0: + raise ValueError(f"Total length of locus structure for {row['id']} is less than the sum of known lengths. 
Please check the input data.") + # Replace None value with the unknown length + for i in range(len(known_lengths)): + if known_lengths[i] is None: + known_lengths[i] = unknown_length + + # Calculate start and stop coordinates for each motif in the locus structure + this_start = row['start_' + genome] - left_flank + for i, struct_dict in enumerate(row['locus_structure']): + this_stop = this_start + known_lengths[i] + row['locus_structure'][i]['start_' + genome] = this_start + row['locus_structure'][i]['stop_' + genome] = this_stop + this_start = this_stop + + return row['locus_structure'] + def trgt_catalog(row, genome = 'hg38', struc_type = 'default'): r""" :param row: dictionary with STR data for a single locus @@ -64,42 +172,42 @@ def trgt_catalog(row, genome = 'hg38', struc_type = 'default'): :param struc_type: options: 'motif', 'default' or 'none'. If 'motif', use pathogenic_motif_reference_orientation as locus structure. If 'default', use . If 'none', do not include locus structure. 
:return: TRGT format catalog string - >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': '100', 'stop_hg38': '200', 'period': '3', 'pathogenic_motif_reference_orientation': ['CAG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': '', 'flank_motif': '', 'reference_motif_reference_orientation': ['CAG'], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}) + >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'period': 3, 'pathogenic_motif_reference_orientation': ['CAG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': [], 'reference_motif_reference_orientation': ['CAG'], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}) 'chr1\t100\t200\tID=myid;MOTIFS=CAG;STRUC=' - >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': '100', 'stop_hg38': '200', 'period': '3', 'pathogenic_motif_reference_orientation': ['CAG', 'CCG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': '', 'flank_motif': '', 'reference_motif_reference_orientation': ['CAG'], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}) + >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'period': 3, 'pathogenic_motif_reference_orientation': ['CAG', 'CCG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': [], 'reference_motif_reference_orientation': ['CAG'], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}) + 'chr1\t100\t200\tID=myid;MOTIFS=CAG,CCG;STRUC=' + + >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'period': 3, 'pathogenic_motif_reference_orientation': ['CAG', 'CCG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CAACAG', 'count': 1, 'type': 'interruption'}, {'motif': 'CCG', 'count': 12, 'type': 'flank_repeat'}], 'reference_motif_reference_orientation': ['CAG'], 'benign_motif_reference_orientation': [], 
'unknown_motif_reference_orientation': []}) 'chr1\t100\t200\tID=myid;MOTIFS=CAG,CCG;STRUC=' - >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': '100', 'stop_hg38': '200', 'period': '3', 'pathogenic_motif_reference_orientation': ['CAGG'], 'gene': 'CNBP', 'id': 'DM2_CNBP', 'locus_structure': '(CAGG)*(CAGA)*(CA)*', 'flank_motif': '(CAGG)n(CAGA)10(CA)19', 'reference_motif_reference_orientation': [], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}) - 'chr1\t100\t278\tID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC=' + >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'period': 3, 'pathogenic_motif_reference_orientation': ['CAGG'], 'gene': 'CNBP', 'id': 'DM2_CNBP', 'locus_structure': [{'motif': 'CAGG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CAGA', 'count': 10, 'type': 'flank_repeat'}, {'motif': 'CA', 'count': 19, 'type': 'flank_repeat'}], 'reference_motif_reference_orientation': [], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}, struc_type='motif') + 'chr1\t100\t200\tID=DM2_CNBP;MOTIFS=CAGG,CAGA,CA;STRUC=(CAGG)n(CAGA)10(CA)19' + + >>> trgt_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'period': 3, 'pathogenic_motif_reference_orientation': ['CAG', 'CCG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': [], 'reference_motif_reference_orientation': ['CAG'], 'benign_motif_reference_orientation': [], 'unknown_motif_reference_orientation': []}) + 'chr1\t100\t200\tID=myid;MOTIFS=CAG,CCG;STRUC=' """ - start = int(row['start_' + genome]) - stop = int(row['stop_' + genome]) + + # Add flank coordinates to locus structure + row['locus_structure'] = add_flank_coordinates(row, genome) + + start = row['start_' + genome] + stop = row['stop_' + genome] struc = '' + motifs = [] - if row['flank_motif'] != '' and row['flank_motif'] is not None: - # get motifs in parentheses using regex - flank_motif = row['flank_motif'] - motifs = re.findall(r'\((.*?)\)', flank_motif) - 
counts = re.findall(r'\)(.*?)[\(|$]', flank_motif.replace('n', 'n(') + '$') - n_found = False - for motif, count in zip(motifs, counts): - struc += f'({motif})n' - if count == 'n': - n_found = True + if len(row['locus_structure']) > 0: + for struct_dict in row['locus_structure']: + if struct_dict['type'] == 'interruption': + # interruptions are not included in the structure continue + motifs.append(struct_dict['motif']) + if struct_dict['count'] is None: + struc += f"({struct_dict['motif']})n" else: - if n_found: - stop += int(count) * len(motif) - else: - start -= int(count) * len(motif) - elif row['locus_structure'] != '' and row['locus_structure'] != None: - locus_structure = row['locus_structure'].strip() - motifs = re.findall(r'\((.*?)\)', locus_structure) - # Substitute * and + with n - struc = locus_structure.replace('*', 'n').replace('+', 'n') - else: - motifs = [] + struc += f"({struct_dict['motif']}){struct_dict['count']}" + + else: # should just do this always? for motif in row['pathogenic_motif_reference_orientation']: motif = motif.strip() # remove leading and trailing whitespace struc += f'({motif})n' @@ -109,7 +217,7 @@ def trgt_catalog(row, genome = 'hg38', struc_type = 'default'): # unique motifs mainitaining order if struc_type == 'motif': - full_struc = f";STRUC{struc}" + full_struc = f";STRUC={struc}" elif struc_type == 'default': # Add all motifs with known function for motif_field in ['pathogenic_motif_reference_orientation', 'reference_motif_reference_orientation', 'benign_motif_reference_orientation']: @@ -126,6 +234,298 @@ def trgt_catalog(row, genome = 'hg38', struc_type = 'default'): return definition +def atarva_catalog(row, genome = 'hg38'): + r""" + :param row: dictionary with STR data for a single locus + :param genome: genome build (hg19, hg38 or T2T) + :return: atarva format catalog string which is a modified BED format with fields: chrom start stop motif motif_len [id] + + Note, compound loci will be split into multiple entries, 
one for each motif. Overlapping loci are okay. + For loci with multiple pathogenic motifs, only the first motif will be used. Atarva does motif decomposition, so alternate motifs should be detected by the caller. + + >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'locus_structure': [], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38') + 'chr1\t100\t200\tCAG\t3\tmyid' + + >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['AAGGG', 'ACAGG'], 'locus_structure': [], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38') + 'chr1\t100\t200\tAAGGG\t5\tmyid' + + >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CAACAG', 'count': 2, 'type': 'flank_repeat'}, {'motif': 'CCG', 'count': 3, 'type': 'flank_repeat'}], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38') + 'chr1\t100\t200\tCAG\t3\tmyid\nchr1\t200\t212\tCAACAG\t6\tmyid_flank\nchr1\t212\t221\tCCG\t3\tmyid_flank' + + >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CAACAG', 'count': 6, 'type': 'flank_repeat'}, {'motif': 'CCG', 'count': 3, 'type': 'flank_repeat'}], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38') + 'chr1\t100\t200\tCAG\t3\tmyid\nchr1\t200\t236\tCAACAG\t6\tmyid_flank\nchr1\t236\t245\tCCG\t3\tmyid_flank' + """ + + # Add flank coordinates to locus structure + row['locus_structure'] = add_flank_coordinates(row, 
genome) + + bed_string = '' + + motif_field = 'pathogenic_motif_reference_orientation' + id_field = 'id' + start = int(row['start_' + genome]) + stop = int(row['stop_' + genome]) + + motifs = row[motif_field] + this_id = row[id_field] + + # check for flanking motif(s) + if len(row['locus_structure']) > 0: + for struct_dict in row['locus_structure']: + motif = struct_dict['motif'] + motif_len = len(motif) + start = struct_dict['start_' + genome] + stop = struct_dict['stop_' + genome] + if struct_dict['type'] == 'main_repeat': + # this is the main repeat + bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{motif_len}\t{this_id}\n" + elif struct_dict['type'] == 'interruption': + # interruptions are not included in the structure + continue + else: + # this is a flank repeat + bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{motif_len}\t{this_id}_flank\n" + + else: + motif = motifs[0] # use first motif only + motif_len = len(motif) + bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{motif_len}\t{this_id}\n" + + return bed_string.rstrip('\n') + +def longtr_catalog(row, genome = 'hg38'): + r""" + :param row: dictionary with STR data for a single locus + :param genome: genome build (hg19, hg38 or T2T) + :return: LongTR format catalog string + + Note, LongTR uses 1-based coordinates (i.e. 
is non-standard BED format) + """ + start = int(row['start_' + genome]) + 1 + stop = int(row['stop_' + genome]) + motifs = row['pathogenic_motif_reference_orientation'] + row['benign_motif_reference_orientation'] + row['reference_motif_reference_orientation'] + # remove duplicates + motifs = list(dict.fromkeys([x.strip() for x in motifs])) + motifs_string = ','.join(motifs) + + definition = f"{row['chrom']}\t{start}\t{stop}\t{motifs_string}\t{row['id']}" + + return definition + +def expansionhunter_catalog(row, genome): + """ + See format description at https://github.com/Illumina/ExpansionHunter/blob/master/docs/04_VariantCatalogFiles.md + + Example from gnomAD: + { + "LocusId": "ABCD3", + "ReferenceRegion": "chr1:94418421-94418442", + "LocusStructure": "(GCC)*", + "VariantType": "Repeat", + "RepeatUnit": "GCC", + "Gene": "ABCD3", + "GeneRegion": "5'-UTR", + "GeneId": "ENSG00000117528", + "DiscoveryMethod": "WGS", + "DiscoveryYear": 2023, + "Diseases": [ + { + "Symbol": "OPDM", + "Name": "Oculopharyngodistal myopathy", + "Inheritance": "AD", + "NormalMax": 44, + "PathogenicMin": 118 + } + ], + "MainReferenceRegion": "chr1:94418421-94418442", + "Inheritance": "AD" + } + + Simple example from ExpansionHunter: + { + "LocusId": "DMPK", + "LocusStructure": "(CAG)*", + "ReferenceRegion": "19:46273462-46273522", # 0-based coordinates + "VariantType": "Repeat" + }, + { + "LocusId": "HTT", + "LocusStructure": "(CAG)*CAACAG(CCG)*", + "ReferenceRegion": ["4:3076604-3076660", "4:3076666-3076693"], + "VariantType": ["Repeat", "Repeat"] + } + """ + raise NotImplementedError + + # Optional fields used by gnomAD: + # locus_dict['MainReferenceRegion'] = f"{row['chrom']}:{row['start_' + genome]}-{row['stop_' + genome]}" + # locus_dict['Inheritance'] = row['inheritance'] + # locus_dict['Gene'] = row['gene'] + # locus_dict['GeneRegion'] = row['type'] + # locus_dict['DiscoveryYear'] = row['year'] + # locus_dict['Diseases'] = [{ + # 'Symbol': row['disease_id'], + # 'Name': 
row['disease'], + # 'Inheritance': row['inheritance'], + # 'NormalMax': row['benign_max'], + # 'PathogenicMin': row['pathogenic_min'] + # }] + +def stranger_catalog(row, genome = 'hg38'): + r""" + :param row: dictionary with STR data for a single locus + :param genome: genome build (hg19, hg38 or T2T) + :return: STRanger format catalog dictionary for a single locus + + Note, the stranger catalog is similar to the ExpansionHunter catalog and in some cases they are used for both purposes. + However, the STRanger catalog is for annotation, and in this case is designed to be used with long-read genotype data. + It is critical that the start coordinates of the stranger catalog match the start coordinates of the catalog used for genotyping. + For this reason, no flanking coordinates are added to the locus structure. + + Example from STRanger: + { + "LocusId": "ABCD3", + "HGNCId": 67, # Currently not implemented + "InheritanceMode": "AD", + "DisplayRU": "CCG", + "LocusStructure": "(CCG)*", + "ReferenceRegion": "1:94418422-94418444", + "VariantType": "Repeat", + "Disease": "OPDM", + "NormalMax": 50, + "PathologicMin": 118 + }, + { + "LocusId": "FXN", + "HGNCId": 3951, + "InheritanceMode": "AR", + "DisplayRU": "GAA", + "SourceDisplay": "GeneReviews Internet 2019-11-07", + "Source": "GeneReviews", + "SourceId": "NBK535148", + "LocusStructure": "(A)*(GAA)*", + "ReferenceRegion": "chr9:69037286-69037304", + "VariantId": "FXN", + "VariantType": "Repeat", + "Disease": "FRDA", + "NormalMax": 35, + "PathologicMin": 51, + "PathologicRegion": "chr9:69037286-69037304" + }, + + >>> stranger_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': [], 'benign_max': 5, 'pathogenic_min': 10, 'inheritance': 'AD', 'disease_id': 'disease_id', 'disease': 'Disease Name', 'year': 2023}, genome='hg38') + {'LocusId': 'myid', 'ReferenceRegion': 'chr1:100-200', 'LocusStructure': '(CAG)*', 
'VariantType': 'Repeat', 'HGNCId': None, 'InheritanceMode': 'AD', 'DisplayRU': 'CAG', 'Disease': 'disease_id', 'NormalMax': 5, 'PathologicMin': 10, 'Gene': 'mygene'} + + >>> stranger_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'gene': 'mygene', 'id': 'myid', 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CAACAG', 'count': 1, 'type': 'interruption'}, {'motif': 'CCG', 'count': 3, 'type': 'flank_repeat'}], 'benign_max': 5, 'pathogenic_min': 10, 'inheritance': 'AD', 'disease_id': 'disease_id', 'disease': 'Disease Name', 'year': 2023}, genome='hg38') + {'LocusId': 'myid', 'ReferenceRegion': ['chr1:100-200', 'chr1:206-215'], 'LocusStructure': '(CAG)*CAACAG(CCG)*', 'VariantType': ['Repeat', 'Repeat'], 'VariantId': ['myid', 'myid_CCG'], 'PathologicRegion': 'chr1:100-200', 'HGNCId': None, 'InheritanceMode': 'AD', 'DisplayRU': 'CAG', 'Disease': 'disease_id', 'NormalMax': 5, 'PathologicMin': 10, 'Gene': 'mygene'} + """ + + row['locus_structure'] = add_flank_coordinates(row, genome) + + locus_dict = {} + + # Required/standard fields from ExpansionHunter: + locus_dict['LocusId'] = row['id'] + locus_dict['ReferenceRegion'] = [] + if len(row['locus_structure']) > 0: + locus_dict['LocusStructure'] = '' + locus_dict['VariantType'] = [] + locus_dict['VariantId'] = [] # used to store the ID of the variant, e.g. 
myid_CCG for the CCG motif in the locus structure + for struct_dict in row['locus_structure']: + + if struct_dict['type'] == 'interruption': + locus_dict['LocusStructure'] += f"{struct_dict['motif']*struct_dict['count']}" # interruptions are included in the structure but not in the variant list + else: + locus_dict['ReferenceRegion'].append(f"{row['chrom']}:{struct_dict['start_' + genome]}-{struct_dict['stop_' + genome]}") # 0-based coordinates + locus_dict['LocusStructure'] += f"({struct_dict['motif']})*" + locus_dict['VariantType'].append('Repeat') + if struct_dict['type'] == 'main_repeat': + locus_dict['VariantId'].append(row['id']) + locus_dict['PathologicRegion'] = f"{row['chrom']}:{struct_dict['start_' + genome]}-{struct_dict['stop_' + genome]}" + else: + locus_dict['VariantId'].append(f"{row['id']}_{struct_dict['motif']}") + else: + locus_dict['ReferenceRegion'] = f"{row['chrom']}:{row['start_' + genome]}-{row['stop_' + genome]}" # 0-based coordinates + locus_dict['LocusStructure'] = f"({row['pathogenic_motif_reference_orientation'][0]})*" # Just use the first pathogenic motif for the structure + locus_dict['VariantType'] = 'Repeat' + + # Fields used by STRanger: + locus_dict['HGNCId'] = None # Currently not implemented, would require mapping of gene names to HGNC IDs + locus_dict['InheritanceMode'] = row['inheritance'] + locus_dict['DisplayRU'] = row['pathogenic_motif_reference_orientation'][0] # use first pathogenic motif for display + locus_dict['Disease'] = row['disease_id'] # disease ID + + + # Do some special handling for missing values and unusual cases: + # If both pathogenic min and benign max are missing, skip the locus + if row['pathogenic_min'] is None and row['benign_max'] is None: + return None + + # If normal max is missing, set it to pathogenic min - 1 + if row['benign_max'] is None: + locus_dict['NormalMax'] = row['pathogenic_min'] - 1 + else: + locus_dict['NormalMax'] = row['benign_max'] + + # If pathogenic min is <= benign max or missing, 
set pathologic min to benign max + 1 + if row['pathogenic_min'] is None or row['pathogenic_min'] <= locus_dict['NormalMax']: + locus_dict['PathologicMin'] = locus_dict['NormalMax'] + 1 + else: + locus_dict['PathologicMin'] = row['pathogenic_min'] + + # Optional extra fields: + locus_dict['Gene'] = row['gene'] + + return locus_dict + +def straglr_catalog(row, genome = 'hg38', format = 'default'): + r""" + :param row: dictionary with STR data for a single locus + :param genome: genome build (hg19, hg38 or T2T) + :param format: options: 'default' or 'wf-human-variation'. If 'default', use chrom, start, stop, motif. If 'wf-human-variation', use chrom, start, stop, motif, gene, id. + :return: straglr format catalog string which is a modified BED format with fields: chrom start stop motif [gene id] + + Example format: + https://github.com/epi2me-labs/wf-human-variation/blob/master/data/wf_str_repeats.bed + + chr1 149390802 149390841 GGC NOTCH2NLC NOTCH2NLC + chr2 190880872 190880920 GCA GLS GLS + chr3 63912684 63912714 GCA ATXN7 ATXN7 + + >>> straglr_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CCG', 'count': 10, 'type': 'flank_repeat'}, {'motif': 'CAA', 'count': 10, 'type': 'flank_repeat'}], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38') + 'chr1\t100\t200\tCAG\nchr1\t200\t230\tCCG\nchr1\t230\t260\tCAA' + + >>> straglr_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CCG', 'count': 10, 'type': 'flank_repeat'}, {'motif': 'CAA', 'count': 10, 'type': 'flank_repeat'}], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38') + 
'chr1\t100\t200\tCAG\nchr1\t200\t230\tCCG\nchr1\t230\t260\tCAA' + + >>> straglr_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'locus_structure': [{'motif': 'CAG', 'count': None, 'type': 'main_repeat'}, {'motif': 'CCG', 'count': 10, 'type': 'flank_repeat'}, {'motif': 'CAA', 'count': 10, 'type': 'flank_repeat'}], 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38', format='wf-human-variation') + 'chr1\t100\t200\tCAG\tmyid\tmyid\nchr1\t200\t230\tCCG\tmyid\tmyid_CCG\nchr1\t230\t260\tCAA\tmyid\tmyid_CAA' + """ + + # Do some special handling for missing values and unusual cases: + # If both pathogenic min and benign max are missing, skip the locus + if row['pathogenic_min'] is None and row['benign_max'] is None: + return None + + bed_list = [] + # Use the same approach as atarva_catalog, but only return the first 4 columns (chrom, start, stop, motif) + atarva_list = [x.split('\t') for x in atarva_catalog(row, genome).split('\n')] + if format == 'default': + for bed_row in atarva_list: + bed_list.append('\t'.join(bed_row[0:4])) + elif format == 'wf-human-variation': + for bed_row in atarva_list: + motif = bed_row[3] + # replace the word "flank" with the motif name in the id + bed_row[5] = bed_row[5].replace('_flank', f'_{motif}') + bed_list.append('\t'.join(bed_row[0:4] + [row['id'], bed_row[5]])) + else: + raise ValueError(f'Unknown format: {format}. 
Expected default or wf-human-variation.') + + bed_string = '\n'.join(bed_list) + + return bed_string + def extended_bed(row, fields = [], genome = 'hg38'): r""" :param row: dictionary with STR data for a single locus @@ -162,7 +562,7 @@ def main(input: str, output: str, *, format: str = 'TRGT', genome: str = 'hg38', :param input: STRchive database file name in JSON format :param output: Output file name in bed format :param genome: Genome build: hg19, hg38, T2T (also accepted: chm13, chm13-T2T, T2T-CHM13) - :param format: Variant caller catalog file format BED format (TRGT or BED) + :param format: Variant caller catalog file format or BED format (TRGT, atarva, LongTR, straglr, stranger, ExpansionHunter or BED) :param cols: Comma separated list of columns to include in the extended BED format beyond chrom,start,stop (no spaces in list). Can be any valid STRchive json field. """ @@ -191,6 +591,50 @@ def main(input: str, output: str, *, format: str = 'TRGT', genome: str = 'hg38', with open(output, 'w') as out_file: for row in data: out_file.write(trgt_catalog(row, genome) + '\n') + elif format.lower() == 'atarva': + with open(output, 'w') as out_file: + header = '#' + '\t'.join(['chrom', 'start', 'stop', 'motif', 'motif_len', 'id']) + '\n' + out_file.write(header) + for row in data: + out_file.write(atarva_catalog(row, genome) + '\n') + elif format.lower() == 'longtr': + with open(output, 'w') as out_file: + for row in data: + out_file.write(longtr_catalog(row, genome) + '\n') + elif format.lower() == 'expansionhunter': + eh_loci = [] + for row in data: + locus = expansionhunter_catalog(row, genome) + eh_loci.append(locus) + # Write the catalog as a JSON array + output = output if output.endswith('.json') else output + '.json' + with open(output, 'w') as out_json_file: + options = jsbeautifier.default_options() + options.indent_size = 2 + options.brace_style="expand" + out_json_file.write(jsbeautifier.beautify(json.dumps(eh_loci, ensure_ascii=False), options)) + 
out_json_file.write('\n') + elif format.lower() == 'stranger': + stranger_loci = [] + for row in data: + locus = stranger_catalog(row, genome) + if locus is not None: + stranger_loci.append(locus) + # Write the catalog as a JSON array + output = output if output.endswith('.json') else output + '.json' + with open(output, 'w') as out_json_file: + options = jsbeautifier.default_options() + options.indent_size = 2 + options.brace_style="expand" + out_json_file.write(jsbeautifier.beautify(json.dumps(stranger_loci, ensure_ascii=False), options)) + out_json_file.write('\n') + elif format.lower() == 'straglr': + with open(output, 'w') as out_file: + # No header for straglr format + for row in data: + straglr_string = straglr_catalog(row, genome, format = 'wf-human-variation') + if straglr_string is not None: # Check if the string is not None + out_file.write(straglr_string + '\n') elif format.lower() == 'bed': fields_list = fields.split(',') header = '#' + '\t'.join(['chrom', 'start', 'stop'] + fields_list) + '\n' @@ -199,7 +643,7 @@ def main(input: str, output: str, *, format: str = 'TRGT', genome: str = 'hg38', for row in data: out_file.write(extended_bed(row, fields_list, genome) + '\n') else: - raise ValueError('Unknown output file format. Expected TRGT or BED.') + raise ValueError('Unknown output file format. 
Expected TRGT, atarva, straglr or BED.') if __name__ == "__main__": import doctest diff --git a/scripts/ref-allele.py b/scripts/ref-allele.py index fb03a210..28787a94 100644 --- a/scripts/ref-allele.py +++ b/scripts/ref-allele.py @@ -64,13 +64,20 @@ def read_bed(bed, format): continue line = line.strip().split('\t') - if format == 'strchive': + if format.lower() == 'strchive': motifs = list(dict.fromkeys([x.strip().upper() for x in line[ref_motif_index].split(',')] + [x.strip().upper() for x in line[path_motif_index].split(',')])) locusid = line[locusid_index] - elif format == 'pacbio': + elif format.lower() == 'pacbio' or format.lower() == 'trgt': annotations = line[3].split(';') motifs = [x.strip().upper() for x in annotations[1].split('=')[1].split(',')] locusid = annotations[0].split('=')[1] + elif format.lower() == 'atarva': + # motifs are in the 4th column, locusid in the 6th column + motifs = [x.strip().upper() for x in line[3].split(',')] + locusid = line[5] + + else: + raise ValueError(f"Unknown format: {format}. 
Supported formats are 'strchive', 'pacbio', 'trgt', and 'atarva'.") locus = { 'chrom': line[0], @@ -125,13 +132,94 @@ def get_ref(fasta, ref_directory='.'): # If the file is not found, raise an error raise FileNotFoundError(f"Reference genome file not found and couldn't be downloaded: {fasta}") -def main(bed1: str, bed2: str, fasta: str, out: str, storage: str = '.', flank: int = 10): +def compare_beds(bed1, bed2, name1, name2, ref, flank=10): + for bed1_info, bed2_info in zip(read_bed(bed1, name1), read_bed(bed2, name2)): + + if bed1_info['id'] != bed2_info['id']: + raise ValueError(f"IDs do not match: {bed1_info['id']} != {bed2_info['id']}") + + # choose the start of the left flank based on the smallest start coordinate + lflank_start = bed1_info['start'] - flank if bed1_info['start'] < bed2_info['start'] else bed2_info['start'] - flank + + # choose the end of the right flank based on the largest end coordinate + rflank_end = bed1_info['end'] + flank if bed1_info['end'] > bed2_info['end'] else bed2_info['end'] + flank + + # fetching the sequences + # NOTE: the flank sequences are extracted from the start and end of bed1_info locus + lflank = ref.fetch(bed1_info['chrom'], lflank_start, bed1_info['start']).upper() + rflank = ref.fetch(bed1_info['chrom'], bed1_info['end'], rflank_end).upper() + bed1_info_seq = ref.fetch(bed1_info['chrom'], bed1_info['start'], bed1_info['end']).upper() + bed2_info_seq = "" + + # flank sequence initialisers + bed1_info_lflank = ""; bed1_info_rflank = "" + bed2_info_lflank = ""; bed2_info_rflank = "" + + if bed1_info['start'] > bed2_info['start']: + # if bed1_info start is greater than bed2_info start + diff = bed1_info['start'] - bed2_info['start'] + # pull difference bases into the repeat for bed2_info and add gaps in the flank + bed2_info_seq = lflank[-diff:] + bed1_info_seq + bed2_info_lflank = lflank[:-diff] + " "*diff + # add difference gaps in the repeat for bed2_info and unchanged flank + bed1_info_seq = " "*diff + 
bed1_info_seq + bed1_info_lflank = lflank + + elif bed1_info['start'] < bed2_info['start']: + # if bed2_info start is greater than bed1_info start + diff = bed2_info['start'] - bed1_info['start'] + # add difference gaps to bed2_info repeat sequence and different bases to flank + bed2_info_seq = " "*diff + bed1_info_seq[diff:] + bed2_info_lflank = lflank + bed1_info_seq[:diff] + # add difference gaps to bed1_info flank + bed1_info_lflank = lflank[:-diff] + " "*diff + else: + # start coordinates are same + bed2_info_lflank = lflank + bed1_info_lflank = lflank + bed2_info_seq = bed1_info_seq + + if bed1_info['end'] < bed2_info['end']: + # if bed1_info end is smaller than bed2_info end + diff = bed2_info['end'] - bed1_info['end'] + # add the difference based from flank to bed2_info repeat and gaps in flank + bed2_info_seq = bed2_info_seq + rflank[:diff] + bed2_info_rflank = " "*diff + rflank[diff:] + # add the difference gaps bed1_info repeat and unchanged flank + bed1_info_seq += " "*diff + bed1_info_rflank = rflank + + elif bed2_info['end'] < bed1_info['end']: + # if bed2_info end is smalled than bed1_info end + diff = bed1_info['end'] - bed2_info['end'] + # remove difference bases from bed2_info repeat and add gaps and add difference bases to the flank + bed2_info_seq = bed2_info_seq[:-diff] + " "*diff + bed2_info_rflank = bed2_info_seq[-diff:] + rflank + # unchanged bed1_info repeat sequence and add difference gaps to bed1_info flank + bed1_info_rflank = " "*diff + rflank + else: + # if the end coordinates are the same + bed1_info_rflank = rflank + bed2_info_rflank = rflank + + bed1_info_seq = split_repeat_sequence(bed1_info['motifs'], bed1_info_seq) + bed2_info_seq = split_repeat_sequence(bed2_info['motifs'], bed2_info_seq) + + yield f"{bed1_info['id']}\n" + yield f"{bed1_info['chrom']}\t{bed1_info['start']}\t{bed1_info['end']}\t{','.join(bed1_info['motifs'])}\t{name1}\n" + yield 
f"{bed2_info['chrom']}\t{bed2_info['start']}\t{bed2_info['end']}\t{','.join(bed2_info['motifs'])}\t{name2}\n" + yield f"{bed1_info_lflank}\t{bed1_info_seq}\t{bed1_info_rflank}\n" + yield f"{bed2_info_lflank}\t{bed2_info_seq}\t{bed2_info_rflank}\n" + yield "\n" + +def main(beds: list[str], names: list[str], fasta: str, out: str, storage: str = '.', flank: int = 10): """ Extracts the flanking sequences and the repeat sequence for tandem repeat loci from the reference genome - :param bed1: Input bed file containing the tandem repeat loci from STRchive - :param bed2: Input bed file containing the tandem repeat loci from PacBio - :param fasta: Reference genome fasta file + # Accept one or more bedfiles and a corresponding list of names. + :param list[str] beds: List of input bed files containing the tandem repeat loci + :param list[str] names: List of names corresponding to each bed file + :param fasta: Reference genome fasta file (required) :param output: Output file name for the extracted sequences :param storage: Reference genome storage directory (default: current directory) :param flank: Flanking sequence length to extract (default: 10) @@ -141,84 +229,10 @@ def main(bed1: str, bed2: str, fasta: str, out: str, storage: str = '.', flank: # Assuming the bed files are sorted and contain the same loci with open(out, 'w') as outfh: - for strchive, pacbio in zip(read_bed(bed1, 'strchive'), read_bed(bed2, 'pacbio')): - if strchive['id'] != pacbio['id']: - raise ValueError(f"IDs do not match: {strchive['id']} != {pacbio['id']}") + + for line in compare_beds(beds[0], beds[1], names[0], names[1], ref, flank): + outfh.write(line) - # choose the start of the left flank based on the smallest start coordinate - lflank_start = strchive['start'] - flank if strchive['start'] < pacbio['start'] else pacbio['start'] - flank - - # choose the end of the right flank based on the largest end coordinate - rflank_end = strchive['end'] + flank if strchive['end'] > pacbio['end'] else 
pacbio['end'] + flank - - # fetching the sequences - # NOTE: the flank sequences are extracted from the start and end of strchive locus - lflank = ref.fetch(strchive['chrom'], lflank_start, strchive['start']).upper() - rflank = ref.fetch(strchive['chrom'], strchive['end'], rflank_end).upper() - strchive_seq = ref.fetch(strchive['chrom'], strchive['start'], strchive['end']).upper() - pacbio_seq = "" - - # flank sequence initialisers - strchive_lflank = ""; strchive_rflank = "" - pacbio_lflank = ""; pacbio_rflank = "" - - if strchive['start'] > pacbio['start']: - # if strchive start is greater than pacbio start - diff = strchive['start'] - pacbio['start'] - # pull difference bases into the repeat for pacbio and add gaps in the flank - pacbio_seq = lflank[-diff:] + strchive_seq - pacbio_lflank = lflank[:-diff] + " "*diff - # add difference gaps in the repeat for pacbio and unchanged flank - strchive_seq = " "*diff + strchive_seq - strchive_lflank = lflank - - elif strchive['start'] < pacbio['start']: - # if pacbio start is greater than strchive start - diff = pacbio['start'] - strchive['start'] - # add difference gaps to pacbio repeat sequence and different bases to flank - pacbio_seq = " "*diff + strchive_seq[diff:] - pacbio_lflank = lflank + strchive_seq[:diff] - # add difference gaps to strchive flank - strchive_lflank = lflank[:-diff] + " "*diff - else: - # start coordinates are same - pacbio_lflank = lflank - strchive_lflank = lflank - pacbio_seq = strchive_seq - - if strchive['end'] < pacbio['end']: - # if strchive end is smaller than pacbio end - diff = pacbio['end'] - strchive['end'] - # add the difference based from flank to pacbio repeat and gaps in flank - pacbio_seq = pacbio_seq + rflank[:diff] - pacbio_rflank = " "*diff + rflank[diff:] - # add the difference gaps strchive repeat and unchanged flank - strchive_seq += " "*diff - strchive_rflank = rflank - - elif pacbio['end'] < strchive['end']: - # if pacbio end is smalled than strchive end - diff = 
strchive['end'] - pacbio['end'] - # remove difference bases from pacbio repeat and add gaps and add difference bases to the flank - pacbio_seq = pacbio_seq[:-diff] + " "*diff - pacbio_rflank = pacbio_seq[-diff:] + rflank - # unchanged strchive repeat sequence and add difference gaps to strchive flank - strchive_rflank = " "*diff + rflank - else: - # if the end coordinates are the same - strchive_rflank = rflank - pacbio_rflank = rflank - - - strchive_seq = split_repeat_sequence(strchive['motifs'], strchive_seq) - pacbio_seq = split_repeat_sequence(pacbio['motifs'], pacbio_seq) - - outfh.write(f"{strchive['id']}\n") - outfh.write(f"{strchive['chrom']}\t{strchive['start']}\t{strchive['end']}\t{','.join(strchive['motifs'])}\tSTRchive\n") - outfh.write(f"{pacbio['chrom']}\t{pacbio['start']}\t{pacbio['end']}\t{','.join(pacbio['motifs'])}\tTRGT\n") - outfh.write(f"{strchive_lflank}\t{strchive_seq}\t{strchive_rflank}\n") - outfh.write(f"{pacbio_lflank}\t{pacbio_seq}\t{pacbio_rflank}\n") - outfh.write("\n") ref.close() @@ -226,4 +240,4 @@ def main(bed1: str, bed2: str, fasta: str, out: str, storage: str = '.', flank: import doctest doctest.testmod() import defopt - defopt.run(main) \ No newline at end of file + defopt.run(main, cli_options='all') \ No newline at end of file diff --git a/scripts/setup-miniconda-patched-environment.yml b/scripts/setup-miniconda-patched-environment.yml index a3e128ff..892c8d73 100644 --- a/scripts/setup-miniconda-patched-environment.yml +++ b/scripts/setup-miniconda-patched-environment.yml @@ -23,6 +23,8 @@ dependencies: - r-purrr - gfortran - pysam + - htslib + - bedtools - nodejs - pyliftover - pip diff --git a/site/src/components/Button.module.css b/site/src/components/Button.module.css index 27fdea7f..c6c10dc6 100644 --- a/site/src/components/Button.module.css +++ b/site/src/components/Button.module.css @@ -7,6 +7,7 @@ padding: 7.5px; gap: 10px; border-radius: var(--rounded); + text-decoration: none; } .button:has(span) { @@ -28,7 +29,6 
@@ border-radius: 999px; background: none; color: currentColor; - text-decoration: none; } .bubble:hover { diff --git a/site/src/components/Header.astro b/site/src/components/Header.astro index 460d9c58..91a48da6 100644 --- a/site/src/components/Header.astro +++ b/site/src/components/Header.astro @@ -28,7 +28,7 @@ const links = [ {version}{version} diff --git a/site/src/components/Link.jsx b/site/src/components/Link.jsx index cef257a1..ae20ca83 100644 --- a/site/src/components/Link.jsx +++ b/site/src/components/Link.jsx @@ -4,7 +4,7 @@ import classes from "./Link.module.css"; const Link = ({ to, newTab = undefined, - showArrow = undefined, + arrow = undefined, children, ...props }) => { @@ -20,9 +20,7 @@ const Link = ({ > {children} {/* indicate third-party site with icon */} - {(showArrow ?? external) && ( - - )} + {(arrow ?? external) && } ); }; diff --git a/site/src/components/Table.jsx b/site/src/components/Table.jsx index 6c7fcbf5..aff50789 100644 --- a/site/src/components/Table.jsx +++ b/site/src/components/Table.jsx @@ -94,7 +94,7 @@ const Table = ({ cols, rows, sort = undefined, showControls = true }) => { return (
-
+
{/* table */} file.includes(locus.gene)); /** get blame info of locus */ @@ -119,7 +119,7 @@ const blame = getJsonBlame(
{identifiers.map(({ name, links, tooltip, info }) => ( <> - + {name}
+ + + + {cols.map((col) => )} + + + + + { + rows.map((row) => ( + + + {cols.map((col) => ( + + ))} + + )) + } + +
{col}
+ {row.link ? ( + + {row.primary} + + ) : ( +
+ {row.primary} +
+ )} + +
{row.secondary}
+
+
+ {filter( + catalogs, + (catalog) => + catalog.software == row.key && catalog.genome === col, + ).map((download) => ( + + ))} +
+
@@ -132,17 +190,15 @@ const bedDownloads = downloads.slice(3); diff --git a/site/src/util/dom.js b/site/src/util/dom.js index 0a3b1961..41375c6e 100644 --- a/site/src/util/dom.js +++ b/site/src/util/dom.js @@ -1,4 +1,4 @@ -import { debounce, now } from "lodash-es"; +import { debounce } from "lodash-es"; import { sleep } from "@/util/misc"; /** scroll page so that mouse stays at same position in document */ diff --git a/workflow/Snakefile b/workflow/Snakefile index c8cb2a79..6f512c1a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -38,13 +38,29 @@ if stages == "all" or stages == "new-refs" or stages == "old-refs": # Citations expand("{base_dir}STRchive-citations.json", base_dir = base_dir), # TRGT bed files - expand("{base_dir}STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir), - # Extended BED files - expand("{base_dir}STRchive-disease-loci.hg38.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.hg19.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.T2T-chm13.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir), + # Atarva bed files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.atarva.bed.gz", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.atarva.bed.gz", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz", base_dir = base_dir), + # stranger catalogs + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.stranger.json", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.stranger.json", 
base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.stranger.json", base_dir = base_dir), + # Straglr bed files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.straglr.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.straglr.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.straglr.bed", base_dir = base_dir), + # General BED files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.general.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.general.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.general.bed", base_dir = base_dir), + # longTR catalog BED files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.longTR.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.longTR.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.longTR.bed", base_dir = base_dir), # Plots # expand("{base_dir}plots/gnomad.json", base_dir = base_dir), expand("{base_dir}plots/age-onset.json", base_dir = base_dir), @@ -59,13 +75,29 @@ elif stages == "skip-refs": # Check loci expand("{base_dir}check-loci.txt", base_dir = base_dir), # TRGT bed files - expand("{base_dir}STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir), - # Extended BED files - expand("{base_dir}STRchive-disease-loci.hg38.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.hg19.bed", base_dir = base_dir), - expand("{base_dir}STRchive-disease-loci.T2T-chm13.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir), + 
expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir), + # Atarva bed files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.atarva.bed.gz", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.atarva.bed.gz", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz", base_dir = base_dir), + # stranger catalogs + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.stranger.json", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.stranger.json", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.stranger.json", base_dir = base_dir), + # Straglr bed files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.straglr.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.straglr.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.straglr.bed", base_dir = base_dir), + # General BED files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.general.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.general.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.general.bed", base_dir = base_dir), + # longTR catalog BED files + expand("{base_dir}catalogs/STRchive-disease-loci.hg38.longTR.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.hg19.longTR.bed", base_dir = base_dir), + expand("{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.longTR.bed", base_dir = base_dir), # Plots # expand("{base_dir}plots/gnomad.json", base_dir = base_dir), expand("{base_dir}plots/age-onset.json", base_dir = base_dir), @@ -169,7 +201,7 @@ rule TRGT_hg38: in_json = in_json, check = "{base_dir}check-loci.txt" output: - results = "{base_dir}STRchive-disease-loci.hg38.TRGT.bed" + results = "{base_dir}catalogs/STRchive-disease-loci.hg38.TRGT.bed" shell: """ python 
{scripts_dir}make-catalog.py -f TRGT -g hg38 {input.in_json} {output.results} @@ -180,7 +212,7 @@ rule TRGT_hg19: in_json = in_json, check = "{base_dir}check-loci.txt" output: - results = "{base_dir}STRchive-disease-loci.hg19.TRGT.bed" + results = "{base_dir}catalogs/STRchive-disease-loci.hg19.TRGT.bed" shell: """ python {scripts_dir}make-catalog.py -f TRGT -g hg19 {input.in_json} {output.results} @@ -191,18 +223,129 @@ rule TRGT_T2T: in_json = in_json, check = "{base_dir}check-loci.txt" output: - results = "{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed" + results = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed" shell: """ python {scripts_dir}make-catalog.py -f TRGT -g T2T {input.in_json} {output.results} """ +rule atarva_hg38: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + bed = "{base_dir}catalogs/STRchive-disease-loci.hg38.atarva.bed", + bed_gz = "{base_dir}catalogs/STRchive-disease-loci.hg38.atarva.bed.gz", + tbi = "{base_dir}catalogs/STRchive-disease-loci.hg38.atarva.bed.gz.tbi" + shell: + """ + python {scripts_dir}make-catalog.py -f atarva -g hg38 {input.in_json} {output.bed} + bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz} + tabix -p bed {output.bed_gz} + """ + +rule atarva_hg19: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + bed = "{base_dir}catalogs/STRchive-disease-loci.hg19.atarva.bed", + bed_gz = "{base_dir}catalogs/STRchive-disease-loci.hg19.atarva.bed.gz", + tbi = "{base_dir}catalogs/STRchive-disease-loci.hg19.atarva.bed.gz.tbi" + shell: + """ + python {scripts_dir}make-catalog.py -f atarva -g hg19 {input.in_json} {output.bed} + bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz} + tabix -p bed {output.bed_gz} + """ + +rule atarva_T2T: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + bed = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed", + bed_gz = 
"{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz", + tbi = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.atarva.bed.gz.tbi" + shell: + """ + python {scripts_dir}make-catalog.py -f atarva -g T2T {input.in_json} {output.bed} + bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz} + tabix -p bed {output.bed_gz} + """ + +rule stranger_hg38: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + results = "{base_dir}catalogs/STRchive-disease-loci.hg38.stranger.json" + shell: + """ + python {scripts_dir}make-catalog.py -f stranger -g hg38 {input.in_json} {output.results} + """ + +rule stranger_hg19: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + results = "{base_dir}catalogs/STRchive-disease-loci.hg19.stranger.json" + shell: + """ + python {scripts_dir}make-catalog.py -f stranger -g hg19 {input.in_json} {output.results} + """ + +rule stranger_T2T: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + results = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.stranger.json" + shell: + """ + python {scripts_dir}make-catalog.py -f stranger -g T2T {input.in_json} {output.results} + """ + +rule straglr_hg38: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + bed = "{base_dir}catalogs/STRchive-disease-loci.hg38.straglr.bed", + shell: + """ + python {scripts_dir}make-catalog.py -f straglr -g hg38 {input.in_json} {output.bed} + """ + +rule straglr_hg19: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + bed = "{base_dir}catalogs/STRchive-disease-loci.hg19.straglr.bed", + shell: + """ + python {scripts_dir}make-catalog.py -f straglr -g hg19 {input.in_json} {output.bed} + """ + +rule straglr_T2T: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + bed = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.straglr.bed", + shell: + """ + python {scripts_dir}make-catalog.py -f straglr -g T2T 
{input.in_json} {output.bed} + """ + rule bed_hg38: input: in_json = in_json, check = "{base_dir}check-loci.txt" output: - results = "{base_dir}STRchive-disease-loci.hg38.bed" + results = "{base_dir}catalogs/STRchive-disease-loci.hg38.general.bed" shell: """ python {scripts_dir}make-catalog.py -f bed -g hg38 {input.in_json} {output.results} @@ -213,7 +356,7 @@ rule bed_hg19: in_json = in_json, check = "{base_dir}check-loci.txt" output: - results = "{base_dir}STRchive-disease-loci.hg19.bed" + results = "{base_dir}catalogs/STRchive-disease-loci.hg19.general.bed" shell: """ python {scripts_dir}make-catalog.py -f bed -g hg19 {input.in_json} {output.results} @@ -224,12 +367,45 @@ rule bed_T2T: in_json = in_json, check = "{base_dir}check-loci.txt" output: - results = "{base_dir}STRchive-disease-loci.T2T-chm13.bed" + results = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.general.bed" shell: """ python {scripts_dir}make-catalog.py -f bed -g T2T {input.in_json} {output.results} """ +rule longTR_hg38: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + results = "{base_dir}catalogs/STRchive-disease-loci.hg38.longTR.bed" + shell: + """ + python {scripts_dir}make-catalog.py -f longTR -g hg38 {input.in_json} {output.results} + """ + +rule longTR_hg19: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + results = "{base_dir}catalogs/STRchive-disease-loci.hg19.longTR.bed" + shell: + """ + python {scripts_dir}make-catalog.py -f longTR -g hg19 {input.in_json} {output.results} + """ + +rule longTR_T2T: + input: + in_json = in_json, + check = "{base_dir}check-loci.txt" + output: + results = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.longTR.bed" + shell: + """ + python {scripts_dir}make-catalog.py -f longTR -g T2T {input.in_json} {output.results} + """ + rule age_onset_plot: input: in_json = in_json, @@ -254,23 +430,24 @@ rule path_size_plot: rule get_ref_alleles: input: - bed_hg19 =
"{base_dir}STRchive-disease-loci.hg19.bed", - TRGT_hg19 = "{base_dir}STRchive-disease-loci.hg19.TRGT.bed", - bed_hg38 = "{base_dir}STRchive-disease-loci.hg38.bed", - TRGT_hg38 = "{base_dir}STRchive-disease-loci.hg38.TRGT.bed", - bed_T2T = "{base_dir}STRchive-disease-loci.T2T-chm13.bed", - TRGT_T2T = "{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed" + bed_hg19 = "{base_dir}catalogs/STRchive-disease-loci.hg19.general.bed", + TRGT_hg19 = "{base_dir}catalogs/STRchive-disease-loci.hg19.TRGT.bed", + bed_hg38 = "{base_dir}catalogs/STRchive-disease-loci.hg38.general.bed", + TRGT_hg38 = "{base_dir}catalogs/STRchive-disease-loci.hg38.TRGT.bed", + bed_T2T = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.general.bed", + TRGT_T2T = "{base_dir}catalogs/STRchive-disease-loci.T2T-chm13.TRGT.bed" output: result_hg19 = "{base_dir}ref-alleles/ref-alleles.hg19.txt", result_hg38 = "{base_dir}ref-alleles/ref-alleles.hg38.txt", result_T2T = "{base_dir}ref-alleles/ref-alleles.T2T-chm13.txt" shell: """ - python {scripts_dir}ref-allele.py {input.bed_hg19} {input.TRGT_hg19} 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz' {output.result_hg19} {ref_directory} - python {scripts_dir}ref-allele.py {input.bed_hg38} {input.TRGT_hg38} 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz' {output.result_hg38} {ref_directory} - python {scripts_dir}ref-allele.py {input.bed_T2T} {input.TRGT_T2T} 'https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0_maskedY_rCRS.fa.gz' {output.result_T2T} {ref_directory} + python {scripts_dir}ref-allele.py --beds {input.bed_hg19} {input.TRGT_hg19} --names STRchive TRGT --fasta 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz' --out {output.result_hg19} --storage {ref_directory} + python {scripts_dir}ref-allele.py --beds {input.bed_hg38} {input.TRGT_hg38} --names STRchive TRGT --fasta
'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz' --out {output.result_hg38} --storage {ref_directory} + python {scripts_dir}ref-allele.py --beds {input.bed_T2T} {input.TRGT_T2T} --names STRchive TRGT --fasta 'https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0_maskedY_rCRS.fa.gz' --out {output.result_T2T} --storage {ref_directory} """ + #Input data not in repo # rule gnomad_plots: