|
| 1 | +import csv |
| 2 | +import fnmatch |
| 3 | + |
| 4 | +regtools_file = '/Users/kcotto/Downloads/junction_pvalues_significant_0.05_filtered_BH_default.tsv' |
| 5 | +spliceAI_file = '/Users/kcotto/Downloads/all_variants_sorted_spliceAI.vcf' |
| 6 | +gtex_file = '/Users/kcotto/Projects/regtools/TCGA_paper/GTEx_matrices_avg/Liver.tsv' |
| 7 | +base_file = regtools_file.split('.')[0] |
| 8 | +tmp_file = f'{base_file}_tmp.tsv' |
| 9 | +final_output_file = f'{base_file}_gtex_spliceai.tsv' |
| 10 | + |
| 11 | + |
| 12 | +def annotate_GTEx(regtools_input_file, gtex_file): |
| 13 | + with open(regtools_input_file, 'r') as regtools, open(gtex_file, 'r') as gtex: |
| 14 | + gtex_values = {} |
| 15 | + gtex_reader = csv.reader(gtex, delimiter='\t') |
| 16 | + regtools_reader = csv.reader(regtools, delimiter='\t') |
| 17 | + next(gtex_reader) |
| 18 | + header = next(regtools_reader) |
| 19 | + outputfile = tmp_file |
| 20 | + match_count = 0 |
| 21 | + with open(outputfile, 'w') as outfile: |
| 22 | + reformatted_header = '\t'.join(header) |
| 23 | + outfile.write(f'{reformatted_header}\tGTEx_mean\tGTEx_sd\n') |
| 24 | + for line in gtex_reader: |
| 25 | + mean = line[2] |
| 26 | + stdev = line[3] |
| 27 | + gtex_key = line[0] |
| 28 | + gtex_values[gtex_key] = (mean, stdev) |
| 29 | + for line in regtools_reader: |
| 30 | + chrom = line[1] |
| 31 | + junction_start = str(int(line[2]) + 1) |
| 32 | + junction_end = str(int(line[3]) -1) |
| 33 | + regtools_key = f'{chrom}_{junction_start}_{junction_end}' |
| 34 | + output_line = '\t'.join(line) |
| 35 | + if regtools_key in gtex_values: |
| 36 | + match_count += 1 |
| 37 | + print(f'match for {output_line}') |
| 38 | + outfile.write(f'{output_line}\t{gtex_values[regtools_key][0]}\t{gtex_values[regtools_key][1]}\n') |
| 39 | + else: |
| 40 | + outfile.write(f'{output_line}\tNA\tNA\n') |
| 41 | + print(match_count) |
| 42 | + |
| 43 | +def annotate_spliceAI(regtools_input_file, spliceAI_file): |
| 44 | + with open(regtools_input_file, 'r') as regtools, open(spliceAI_file, 'r') as spliceAI: |
| 45 | + spliceAI_reader = csv.reader(spliceAI, delimiter='\t') |
| 46 | + regtools_reader = csv.reader(regtools, delimiter='\t') |
| 47 | + spliceAI_values = {} |
| 48 | + for line in spliceAI_reader: |
| 49 | + li = str(line[0]).strip() |
| 50 | + if not li.startswith('#'): |
| 51 | + info_field = line[7] |
| 52 | + if str(info_field).__contains__('SpliceAI'): |
| 53 | + pattern = 'SpliceAI*' |
| 54 | + extract_SF = line[7].split(';') |
| 55 | + matching = fnmatch.filter(extract_SF, pattern) |
| 56 | + spliceAI_variant_key = f'{line[0]}:{line[1]}' |
| 57 | + spliceAI_string = matching[0] |
| 58 | + if spliceAI_string.__contains__(','): |
| 59 | + split_values = spliceAI_string.split(',') |
| 60 | + spliceAI_values[spliceAI_variant_key] = split_values[0] |
| 61 | + else: |
| 62 | + spliceAI_values[spliceAI_variant_key] = spliceAI_string |
| 63 | + header = next(regtools_reader) |
| 64 | + outputfile = final_output_file |
| 65 | + with open (outputfile, 'w') as outfile: |
| 66 | + reformatted_header = '\t'.join(header) |
| 67 | + outfile.write(f'{reformatted_header}\tSpliceAI_raw\tSpliceAI_match\n') |
| 68 | + for line in regtools_reader: |
| 69 | + chrom = line[6].split(':')[0] |
| 70 | + variant = line[6].split('-')[-1] |
| 71 | + regtools_variant_key = f'{chrom}:{variant}' |
| 72 | + output_line = '\t'.join(line) |
| 73 | + if regtools_variant_key in spliceAI_values: |
| 74 | + junction_start_match = False |
| 75 | + junction_end_match = False |
| 76 | + junction_start = int(line[2]) |
| 77 | + junction_end = int(line[3]) |
| 78 | + spliceAI_info = spliceAI_values[regtools_variant_key] |
| 79 | + spliceAI_split = spliceAI_info.split('|') |
| 80 | + gene = spliceAI_split[1] |
| 81 | + positions = spliceAI_split[-4:] |
| 82 | + for position in positions: |
| 83 | + genomic_location = int(variant) + int(position) |
| 84 | + if junction_start == genomic_location: |
| 85 | + junction_start_match = True |
| 86 | + if junction_end == genomic_location: |
| 87 | + junction_end_match = True |
| 88 | + if junction_start_match and junction_end_match: |
| 89 | + outfile.write(f'{output_line}\t{spliceAI_info}\tjunction start and end match\n') |
| 90 | + elif junction_start_match: |
| 91 | + outfile.write(f'{output_line}\t{spliceAI_info}\tjunction start match\n') |
| 92 | + elif junction_end_match: |
| 93 | + outfile.write(f'{output_line}\t{spliceAI_info}\tjunction end match\n') |
| 94 | + else: |
| 95 | + outfile.write(f'{output_line}\t{spliceAI_info}\tNA\n') |
| 96 | + else: |
| 97 | + outfile.write(f'{output_line}\tNA\tNA\n') |
| 98 | + |
| 99 | + |
| 100 | +annotate_GTEx(regtools_file, gtex_file) |
| 101 | +annotate_spliceAI(tmp_file, spliceAI_file) |
0 commit comments