Skip to content

Commit 9283487

Browse files
committed
generate atarva regions files
1 parent 0e3a3d7 commit 9283487

File tree

3 files changed

+145
-8
lines changed

3 files changed

+145
-8
lines changed

scripts/environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ dependencies:
2323
- r-stringr
2424
- r-purrr
2525
- pysam
26+
- htslib
27+
- bedtools
2628
- nodejs # build website locally
2729
- pyliftover
2830
- pip

scripts/make-catalog.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,82 @@ def trgt_catalog(row, genome = 'hg38', struc_type = 'default'):
126126

127127
return definition
128128

129+
def atarva_catalog(row, genome = 'hg38'):
    r"""
    :param row: dictionary with STR data for a single locus
    :param genome: genome build (hg19, hg38 or T2T)
    :return: atarva format catalog string which is a modified BED format with fields: chrom start stop motif motif_len [id]

    Note, compound loci and loci with multiple pathogenic motifs will be split into multiple entries, one for each motif. Overlapping loci are okay.

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'flank_motif': '', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tCAG\t3\tmyid'

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['AAGGG', 'ACAGG'], 'flank_motif': '', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tAAGGG\t5\tmyid_AAGGG\nchr1\t100\t200\tACAGG\t5\tmyid_ACAGG'

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'flank_motif': '(CAG)nCAACAG(CCG)12', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tCAG\t3\tmyid\nchr1\t200\t206\tCAACAG\t6\tmyid_flank\nchr1\t206\t242\tCCG\t3\tmyid_flank'

    >>> atarva_catalog({'chrom': 'chr1', 'start_hg38': 100, 'stop_hg38': 200, 'pathogenic_motif_reference_orientation': ['CAG'], 'flank_motif': '(CAG)n(CCG)10(CAA)10', 'gene': 'mygene', 'id': 'myid', 'pathogenic_min': 10, 'inheritance': 'AD', 'disease': 'Disease Name'}, 'hg38')
    'chr1\t100\t200\tCAG\t3\tmyid\nchr1\t200\t230\tCCG\t3\tmyid_flank\nchr1\t230\t260\tCAA\t3\tmyid_flank'
    """
    bed_string = ''

    motif_field = 'pathogenic_motif_reference_orientation'
    id_field = 'id'
    start = int(row['start_' + genome])
    stop = int(row['stop_' + genome])

    motifs = row[motif_field]
    locus_id = row[id_field]

    # One BED entry per pathogenic motif; loci with multiple pathogenic motifs
    # get the motif appended to the id so each entry has a unique identifier.
    if len(motifs) > 1:
        for motif in motifs:
            bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{len(motif)}\t{locus_id}_{motif}\n"
    else:
        motif = motifs[0]
        bed_string += f"{row['chrom']}\t{start}\t{stop}\t{motif}\t{len(motif)}\t{locus_id}\n"

    # check for flanking motif(s), e.g. '(CAG)nCAACAG(CCG)12'
    flank_motif = row['flank_motif']
    if flank_motif:  # skips both '' and None
        # Repeat counts are the text between runs of motif letters:
        # '(CAG)nCAACAG(CCG)12' -> ['n', '', '12'] (first split element precedes any motif, drop it)
        split_flank_counts = re.split(r"[ATCGN]+", flank_motif)
        all_flank_motifs_counts = [s.replace('(', '').replace(')', '') for s in split_flank_counts[1:]]

        # Motifs are what remains after blanking out the structure characters
        # (parentheses, the 'n' variable-count marker, and digits).
        structure_chars = str.maketrans({c: ' ' for c in '()n0123456789'})
        all_flank_motifs = flank_motif.translate(structure_chars).split()

        flank_start = stop
        flank_stop = stop
        for motif, count in zip(all_flank_motifs, all_flank_motifs_counts):
            if count == 'n':
                # 'n' marks the variable pathogenic repeat already emitted above
                continue
            # an empty count means the motif occurs exactly once
            flank_stop += int(count or 1) * len(motif)
            # NOTE: flank entries always use the bare locus id + '_flank'
            # (previously a multi-motif locus leaked its last per-motif id here)
            bed_string += f"{row['chrom']}\t{flank_start}\t{flank_stop}\t{motif}\t{len(motif)}\t{locus_id}_flank\n"
            flank_start = flank_stop

    return bed_string.rstrip('\n')
203+
204+
129205
def extended_bed(row, fields = [], genome = 'hg38'):
130206
r"""
131207
:param row: dictionary with STR data for a single locus
@@ -191,6 +267,12 @@ def main(input: str, output: str, *, format: str = 'TRGT', genome: str = 'hg38',
191267
with open(output, 'w') as out_file:
192268
for row in data:
193269
out_file.write(trgt_catalog(row, genome) + '\n')
270+
elif format.lower() == 'atarva':
271+
with open(output, 'w') as out_file:
272+
header = '#' + '\t'.join(['chrom', 'start', 'stop', 'motif', 'motif_len', 'id']) + '\n'
273+
out_file.write(header)
274+
for row in data:
275+
out_file.write(atarva_catalog(row, genome) + '\n')
194276
elif format.lower() == 'bed':
195277
fields_list = fields.split(',')
196278
header = '#' + '\t'.join(['chrom', 'start', 'stop'] + fields_list) + '\n'

workflow/Snakefile

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# Usage:
22

33
# Run all stages (default, takes hours):
4-
# snakemake
4+
# snakemake --cores 'all'
55
# or
6-
# snakemake --config stages="all"
6+
# snakemake --config stages="all" --cores 'all'
77

88
# Skip retrieve and manubot, which will speed things up substantially (runs in seconds):
9-
# snakemake --config stages="skip-refs"
9+
# snakemake --config stages="skip-refs" --cores 'all'
1010

1111
# Fetches new citations but doesn't update existing ones. Slow, but faster than "all".
12-
# snakemake --config stages="new-refs"
12+
# snakemake --config stages="new-refs" --cores 'all'
1313

1414

1515
configfile: 'workflow/config.yaml'
@@ -40,6 +40,10 @@ if stages == "all" or stages == "new-refs":
4040
expand("{base_dir}STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir),
4141
expand("{base_dir}STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir),
4242
expand("{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir),
43+
# Atarva bed files
44+
expand("{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz", base_dir = base_dir),
45+
expand("{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz", base_dir = base_dir),
46+
expand("{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz", base_dir = base_dir),
4347
# Extended BED files
4448
expand("{base_dir}STRchive-disease-loci.hg38.bed", base_dir = base_dir),
4549
expand("{base_dir}STRchive-disease-loci.hg19.bed", base_dir = base_dir),
@@ -61,18 +65,22 @@ elif stages == "skip-refs":
6165
expand("{base_dir}STRchive-disease-loci.hg38.TRGT.bed", base_dir = base_dir),
6266
expand("{base_dir}STRchive-disease-loci.hg19.TRGT.bed", base_dir = base_dir),
6367
expand("{base_dir}STRchive-disease-loci.T2T-chm13.TRGT.bed", base_dir = base_dir),
68+
# Atarva bed files
69+
expand("{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz", base_dir = base_dir),
70+
expand("{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz", base_dir = base_dir),
71+
expand("{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz", base_dir = base_dir),
6472
# Extended BED files
6573
expand("{base_dir}STRchive-disease-loci.hg38.bed", base_dir = base_dir),
6674
expand("{base_dir}STRchive-disease-loci.hg19.bed", base_dir = base_dir),
6775
expand("{base_dir}STRchive-disease-loci.T2T-chm13.bed", base_dir = base_dir),
6876
# Plots
6977
# expand("{base_dir}plots/gnomad.json", base_dir = base_dir),
7078
expand("{base_dir}plots/age-onset.json", base_dir = base_dir),
71-
expand("{base_dir}plots/path-size.json", base_dir = base_dir),
79+
expand("{base_dir}plots/path-size.json", base_dir = base_dir)#,
7280
# Reference alleles
73-
expand("{base_dir}ref-alleles/ref-alleles.hg19.txt", base_dir = base_dir),
74-
expand("{base_dir}ref-alleles/ref-alleles.hg38.txt", base_dir = base_dir),
75-
expand("{base_dir}ref-alleles/ref-alleles.T2T-chm13.txt", base_dir = base_dir)
81+
# expand("{base_dir}ref-alleles/ref-alleles.hg19.txt", base_dir = base_dir),
82+
# expand("{base_dir}ref-alleles/ref-alleles.hg38.txt", base_dir = base_dir),
83+
# expand("{base_dir}ref-alleles/ref-alleles.T2T-chm13.txt", base_dir = base_dir)
7684
else:
7785
raise ValueError("Invalid stages value. Must be 'all', 'new-refs', or 'skip-refs'")
7886

@@ -181,6 +189,51 @@ rule TRGT_T2T:
181189
python {scripts_dir}make-catalog.py -f TRGT -g T2T {input.in_json} {output.results}
182190
"""
183191

192+
# Build the atarva BED catalog for hg38, then coordinate-sort, bgzip-compress
# and tabix-index it (atarva consumes the bgzipped+indexed file).
rule atarva_hg38:
    input:
        in_json = in_json,
        check = "{base_dir}check-loci.txt"
    output:
        bed = "{base_dir}STRchive-disease-loci.hg38.atarva.bed",
        bed_gz = "{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz",
        tbi = "{base_dir}STRchive-disease-loci.hg38.atarva.bed.gz.tbi"
    shell:
        """
        python {scripts_dir}make-catalog.py -f atarva -g hg38 {input.in_json} {output.bed}
        bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz}
        tabix -p bed {output.bed_gz}
        """
206+
207+
# Build the atarva BED catalog for hg19, then coordinate-sort, bgzip-compress
# and tabix-index it (atarva consumes the bgzipped+indexed file).
rule atarva_hg19:
    input:
        in_json = in_json,
        check = "{base_dir}check-loci.txt"
    output:
        bed = "{base_dir}STRchive-disease-loci.hg19.atarva.bed",
        bed_gz = "{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz",
        tbi = "{base_dir}STRchive-disease-loci.hg19.atarva.bed.gz.tbi"
    shell:
        """
        python {scripts_dir}make-catalog.py -f atarva -g hg19 {input.in_json} {output.bed}
        bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz}
        tabix -p bed {output.bed_gz}
        """
221+
222+
# Build the atarva BED catalog for T2T-chm13, then coordinate-sort,
# bgzip-compress and tabix-index it (atarva consumes the bgzipped+indexed file).
rule atarva_T2T:
    input:
        in_json = in_json,
        check = "{base_dir}check-loci.txt"
    output:
        bed = "{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed",
        bed_gz = "{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz",
        tbi = "{base_dir}STRchive-disease-loci.T2T-chm13.atarva.bed.gz.tbi"
    shell:
        """
        python {scripts_dir}make-catalog.py -f atarva -g T2T {input.in_json} {output.bed}
        bedtools sort -i {output.bed} | bgzip -c > {output.bed_gz}
        tabix -p bed {output.bed_gz}
        """
236+
184237
rule bed_hg38:
185238
input:
186239
in_json = in_json,

0 commit comments

Comments
 (0)