Skip to content

Commit 9283487

Browse files
committed
generate atarva regions files
1 parent 0e3a3d7 commit 9283487

File tree

3 files changed

+145
-8
lines changed

3 files changed

+145
-8
lines changed

scripts/environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ dependencies:
2323
- r-stringr
2424
- r-purrr
2525
- pysam
26+
- htslib
27+
- bedtools
2628
- nodejs # build website locally
2729
- pyliftover
2830
- pip

scripts/make-catalog.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,82 @@ def trgt_catalog(row, genome = 'hg38', struc_type = 'default'):
126126

127127
return definition
128128

129+
def atarva_catalog(row, genome = 'hg38'):
    r"""
    :param row: dictionary with STR data for a single locus
    :param genome: genome build (hg19, hg38 or T2T)
    :return: atarva format catalog string which is a modified BED format with fields: chrom start stop motif motif_len [id]

    Note, compound loci and loci with multiple pathogenic motifs will be split into multiple entries, one for each motif. Overlapping loci are okay.

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'flank_motif': '', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tCAG\t3\tmyid'

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['AAGGG', 'ACAGG'], 'flank_motif': '', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tAAGGG\t5\tmyid_AAGGG\nchr1\t100\t200\tACAGG\t5\tmyid_ACAGG'

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'flank_motif': '(CAG)nCAACAG(CCG)12', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tCAG\t3\tmyid\nchr1\t200\t206\tCAACAG\t6\tmyid_flank\nchr1\t206\t242\tCCG\t3\tmyid_flank'

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'flank_motif': '(CAG)n(CCG)10(CAA)10', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tCAG\t3\tmyid\nchr1\t200\t230\tCCG\t3\tmyid_flank\nchr1\t230\t260\tCAA\t3\tmyid_flank'
    """
    bed_string = ''

    motif_field = 'pathogenic_motif_reference_orientation'
    id_field = 'id'
    start = int(row['start_' + genome])
    stop = int(row['stop_' + genome])

    motifs = row[motif_field]
    locus_id = row[id_field]

    # One BED entry per pathogenic motif; loci with multiple pathogenic motifs
    # get the motif appended to the id so each entry has a unique identifier.
    if len(motifs) > 1:
        for motif in motifs:
            bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{len(motif)}\t{locus_id}_{motif}\n"
    else:
        motif = motifs[0]
        bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{len(motif)}\t{locus_id}\n"

    # check for flanking motif(s), e.g. '(CAG)nCAACAG(CCG)12'
    flank_motif = row['flank_motif']
    if flank_motif:  # skips both '' and None
        # Repeat counts are the text between runs of motif letters:
        # '(CAG)nCAACAG(CCG)12' -> ['n', '', '12'] (first split element precedes any motif, drop it)
        split_flank_counts = re.split(r"[ATCGN]+", flank_motif)
        all_flank_motifs_counts = [s.replace('(', '').replace(')', '') for s in split_flank_counts[1:]]

        # Motifs are what remains after blanking out the structure characters
        # (parentheses, the 'n' variable-count marker, and digits).
        structure_chars = str.maketrans({c: ' ' for c in '()n0123456789'})
        all_flank_motifs = flank_motif.translate(structure_chars).split()

        flank_start = stop
        flank_stop = stop
        for motif, count in zip(all_flank_motifs, all_flank_motifs_counts):
            if count == 'n':
                # 'n' marks the variable pathogenic repeat already emitted above
                continue
            # an empty count means the motif occurs exactly once
            flank_stop += int(count or 1) * len(motif)
            # NOTE: flank entries always use the bare locus id + '_flank'
            # (previously a multi-motif locus leaked its last per-motif id here)
            bed_string += f"{row['chrom']}\t{flank_start}\t{flank_stop}\t{motif}\t{len(motif)}\t{locus_id}_flank\n"
            flank_start = flank_stop

    return bed_string.rstrip('\n')
203+
204+
129205
def extended_bed(row, fields = [], genome = 'hg38'):
130206
r"""
131207
:param row: dictionary with STR data for a single locus
@@ -191,6 +267,12 @@ def main(input: str, output: str, *, format: str = 'TRGT', genome: str = 'hg38',
191267
with open(output, 'w') as out_file:
192268
for row in data:
193269
out_file.write(trgt_catalog(row, genome) + '\n')
270+
elif format.lower() == 'atarva':
271+
with open(output, 'w') as out_file:
272+
header = '#' + '\t'.join(['chrom', 'start', 'stop', 'motif', 'motif_len', 'id']) + '\n'
273+
out_file.write(header)
274+
for row in data:
275+
out_file.write(atarva_catalog(row, genome) + '\n')
194276
elif format.lower() == 'bed':
195277
fields_list = fields.split(',')
196278
header = '#' + '\t'.join(['chrom', 'start', 'stop'] + fields_list) + '\n'

workflow/Snakefile

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# Usage:
22

33
# Run all stages (default, takes hours):
4-
# snakemake
4+
# snakemake --cores 'all'
55
# or
6-
# snakemake --config stages="all"
6+
# snakemake --config stages="all" --cores 'all'
77

88
# Skip retrieve and manubot, which will speed things up substantially (runs in seconds):
9-
# snakemake --config stages="skip-refs"
9+
# snakemake --config stages="skip-refs" --cores 'all'
1010

1111
# Fetches new citations but doesn't update existing ones. Slow, but faster than "all".
12-
# snakemake --config stages="new-refs"
12+
# snakemake --config stages="new-refs" --cores 'all'
1313

1414

1515
configfile: 'workflow/config.yaml'
@@ -40,6 +40,10 @@ if stages == "all" or stages == "new-refs":
4040
expand("{base_dir}STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir),
4141
expand("{base_dir}STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir),
4242
expand("{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir),
43+
# Atarva bed files
44+
expand("{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz", base_dir = base_dir),
45+
expand("{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz", base_dir = base_dir),
46+
expand("{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz", base_dir = base_dir),
4347
# Extended BED files
4448
expand("{base_dir}STRchive-disease-loci.hg38.bed", base_dir = base_dir),
4549
expand("{base_dir}STRchive-disease-loci.hg19.bed", base_dir = base_dir),
@@ -61,18 +65,22 @@ elif stages == "skip-refs":
6165
expand("{base_dir}STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir),
6266
expand("{base_dir}STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir),
6367
expand("{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir),
68+
# Atarva bed files
69+
expand("{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz", base_dir = base_dir),
70+
expand("{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz", base_dir = base_dir),
71+
expand("{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz", base_dir = base_dir),
6472
# Extended BED files
6573
expand("{base_dir}STRchive-disease-loci.hg38.bed", base_dir = base_dir),
6674
expand("{base_dir}STRchive-disease-loci.hg19.bed", base_dir = base_dir),
6775
expand("{base_dir}STRchive-disease-loci.T2T-chm13.bed", base_dir = base_dir),
6876
# Plots
6977
# expand("{base_dir}plots/gnomad.json", base_dir = base_dir),
7078
expand("{base_dir}plots/age-onset.json", base_dir = base_dir),
71-
expand("{base_dir}plots/path-size.json", base_dir = base_dir),
79+
expand("{base_dir}plots/path-size.json", base_dir = base_dir)#,
7280
# Reference alleles
73-
expand("{base_dir}ref-alleles/ref-alleles.hg19.txt", base_dir = base_dir),
74-
expand("{base_dir}ref-alleles/ref-alleles.hg38.txt", base_dir = base_dir),
75-
expand("{base_dir}ref-alleles/ref-alleles.T2T-chm13.txt", base_dir = base_dir)
81+
# expand("{base_dir}ref-alleles/ref-alleles.hg19.txt", base_dir = base_dir),
82+
# expand("{base_dir}ref-alleles/ref-alleles.hg38.txt", base_dir = base_dir),
83+
# expand("{base_dir}ref-alleles/ref-alleles.T2T-chm13.txt", base_dir = base_dir)
7684
else:
7785
raise ValueError("Invalid stages value. Must be 'all', 'new-refs', or 'skip-refs'")
7886

@@ -181,6 +189,51 @@ rule TRGT_T2T:
181189
python {scripts_dir}make-catalog.py -f TRGT -g T2T {input.in_json} {output.results}
182190
"""
183191

192+
# Build the atarva BED catalog for hg38, then coordinate-sort, bgzip-compress
# and tabix-index it (atarva consumes the bgzipped+indexed file).
rule atarva_hg38:
    input:
        in_json = in_json,
        check = "{base_dir}check-loci.txt"
    output:
        bed = "{base_dir}STRchive-disease-loci.hg38.atarva.bed",
        bed_gz = "{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz",
        tbi = "{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz.tbi"
    shell:
        """
        python {scripts_dir}make-catalog.py -f atarva -g hg38 {input.in_json} {output.bed}
        bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz}
        tabix -p bed {output.bed_gz}
        """
206+
207+
# Build the atarva BED catalog for hg19, then coordinate-sort, bgzip-compress
# and tabix-index it (atarva consumes the bgzipped+indexed file).
rule atarva_hg19:
    input:
        in_json = in_json,
        check = "{base_dir}check-loci.txt"
    output:
        bed = "{base_dir}STRchive-disease-loci.hg19.atarva.bed",
        bed_gz = "{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz",
        tbi = "{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz.tbi"
    shell:
        """
        python {scripts_dir}make-catalog.py -f atarva -g hg19 {input.in_json} {output.bed}
        bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz}
        tabix -p bed {output.bed_gz}
        """
221+
222+
# Build the atarva BED catalog for T2T-chm13, then coordinate-sort,
# bgzip-compress and tabix-index it (atarva consumes the bgzipped+indexed file).
rule atarva_T2T:
    input:
        in_json = in_json,
        check = "{base_dir}check-loci.txt"
    output:
        bed = "{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed",
        bed_gz = "{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz",
        tbi = "{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz.tbi"
    shell:
        """
        python {scripts_dir}make-catalog.py -f atarva -g T2T {input.in_json} {output.bed}
        bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz}
        tabix -p bed {output.bed_gz}
        """
236+
184237
rule bed_hg38:
185238
input:
186239
in_json = in_json,

0 commit comments

Comments
 (0)