|
| 1 | +import numpy as np |
| 2 | +import subprocess |
| 3 | +import argparse |
| 4 | +import h5py |
| 5 | +import tempfile |
| 6 | +import os |
| 7 | + |
| 8 | +def fetch_tomtom_args(): |
| 9 | + parser = argparse.ArgumentParser() |
| 10 | + parser.add_argument("-m", "--modisco_h5py", required=True, type=str, help="path to the output .h5py file generated by the run_modisco.py script") |
| 11 | + parser.add_argument("-o", "--output_prefix", required=True, type=str, help="Path and name of the TSV file to store the tomtom output") |
| 12 | + parser.add_argument("-d", "--meme_motif_db", required=True, type=str, help="path to motif database") |
| 13 | + parser.add_argument("-n", "--top_n_matches", type=int, default=3, help="Max number of matches to return from TomTom") |
| 14 | + parser.add_argument("-tt", "--tomtom_exec", type=str, default='tomtom', help="Command to use to execute tomtom") |
| 15 | + parser.add_argument("-th", "--trim_threshold", type=float, default=0.3, help="Trim threshold for trimming long motif, trim to those with at least prob trim_threshold on both ends") |
| 16 | + parser.add_argument("-tm", "--trim_min_length", type=int, default=3, help="Minimum acceptable length of motif after trimming") |
| 17 | + args = parser.parse_args() |
| 18 | + return args |
| 19 | + |
| 20 | + |
| 21 | +def write_meme_file(ppm, bg, fname): |
| 22 | + f = open(fname, 'w') |
| 23 | + f.write('MEME version 4\n\n') |
| 24 | + f.write('ALPHABET= ACGT\n\n') |
| 25 | + f.write('strands: + -\n\n') |
| 26 | + f.write('Background letter frequencies (from unknown source):\n') |
| 27 | + f.write('A %.3f C %.3f G %.3f T %.3f\n\n' % tuple(list(bg))) |
| 28 | + f.write('MOTIF 1 TEMP\n\n') |
| 29 | + f.write('letter-probability matrix: alength= 4 w= %d nsites= 1 E= 0e+0\n' % ppm.shape[0]) |
| 30 | + for s in ppm: |
| 31 | + f.write('%.5f %.5f %.5f %.5f\n' % tuple(s)) |
| 32 | + f.close() |
| 33 | + |
| 34 | + |
| 35 | +def fetch_tomtom_matches(ppm, cwm, background=[0.25, 0.25, 0.25, 0.25], tomtom_exec_path='tomtom', motifs_db='HOCOMOCOv11_core_HUMAN_mono_meme_format.meme', n=5, trim_threshold=0.3, trim_min_length=3): |
| 36 | + |
| 37 | + """Fetches top matches from a motifs database using TomTom. |
| 38 | + Args: |
| 39 | + ppm: position probability matrix- numpy matrix of dimension (N,4) |
| 40 | + background: list with ACGT background probabilities |
| 41 | + tomtom_exec_path: path to TomTom executable |
| 42 | + motifs_db: path to motifs database in meme format |
| 43 | + n: number of top matches to return, ordered by p-value |
| 44 | + temp_dir: directory for storing temp files |
| 45 | + trim_threshold: the ppm is trimmed from left till first position for which |
| 46 | + probability for any base pair >= trim_threshold. Similarly from right. |
| 47 | + Returns: |
| 48 | + list: a list of up to n results returned by tomtom, each entry is a |
| 49 | + dictionary with keys 'Target ID', 'p-value', 'E-value', 'q-value' |
| 50 | + """ |
| 51 | + |
| 52 | + _, fname = tempfile.mkstemp() |
| 53 | + |
| 54 | + score = np.sum(np.abs(cwm), axis=1) |
| 55 | + trim_thresh = np.max(score) * trim_threshold # Cut off anything less than 30% of max score |
| 56 | + pass_inds = np.where(score >= trim_thresh)[0] |
| 57 | + trimmed = ppm[np.min(pass_inds): np.max(pass_inds) + 1] |
| 58 | + |
| 59 | + # can be None of no base has prob>t |
| 60 | + if trimmed is None: |
| 61 | + return [] |
| 62 | + |
| 63 | + # trim and prepare meme file |
| 64 | + write_meme_file(trimmed, background, fname) |
| 65 | + |
| 66 | + # run tomtom |
| 67 | + cmd = '%s -no-ssc -oc . -verbosity 1 -text -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 %s %s' % (tomtom_exec_path, fname, motifs_db) |
| 68 | + #print(cmd) |
| 69 | + out = subprocess.check_output(cmd, shell=True) |
| 70 | + |
| 71 | + # prepare output |
| 72 | + dat = [x.split('\\t') for x in str(out).split('\\n')] |
| 73 | + schema = dat[0] |
| 74 | + |
| 75 | + # meme v4 vs v5: |
| 76 | + if 'Target ID' in schema: |
| 77 | + tget_idx = schema.index('Target ID') |
| 78 | + else: |
| 79 | + tget_idx = schema.index('Target_ID') |
| 80 | + |
| 81 | + pval_idx, eval_idx, qval_idx =schema.index('p-value'), schema.index('E-value'), schema.index('q-value') |
| 82 | + |
| 83 | + r = [] |
| 84 | + for t in dat[1:min(1+n, len(dat)-1)]: |
| 85 | + if t[0]=='': |
| 86 | + break |
| 87 | + |
| 88 | + mtf = {} |
| 89 | + mtf['Target_ID'] = t[tget_idx] |
| 90 | + mtf['p-value'] = float(t[pval_idx]) |
| 91 | + mtf['E-value'] = float(t[eval_idx]) |
| 92 | + mtf['q-value'] = float(t[qval_idx]) |
| 93 | + r.append(mtf) |
| 94 | + |
| 95 | + os.system('rm ' + fname) |
| 96 | + return r |
| 97 | + |
| 98 | + |
| 99 | +def main(): |
| 100 | + args = fetch_tomtom_args() |
| 101 | + |
| 102 | + modisco_results = h5py.File(args.modisco_h5py, 'r') |
| 103 | + |
| 104 | + # get pfms |
| 105 | + ppms = [] |
| 106 | + cwms = [] |
| 107 | + seqlet_tally = [] |
| 108 | + names = [] |
| 109 | + |
| 110 | + for metacluster_name in modisco_results['metacluster_idx_to_submetacluster_results']: |
| 111 | + metacluster = modisco_results['metacluster_idx_to_submetacluster_results'][metacluster_name] |
| 112 | + all_pattern_names = [x.decode("utf-8") for x in list(metacluster["seqlets_to_patterns_result"]["patterns"]["all_pattern_names"][:])] |
| 113 | + |
| 114 | + for pattern_name in all_pattern_names: |
| 115 | + |
| 116 | + ppm = np.array(metacluster['seqlets_to_patterns_result']['patterns'][pattern_name]['sequence']['fwd']) |
| 117 | + num_seqlets = len(metacluster['seqlets_to_patterns_result']['patterns'][pattern_name]['seqlets_and_alnmts']['seqlets']) |
| 118 | + cwm = np.array(metacluster['seqlets_to_patterns_result']['patterns'][pattern_name]["task0_contrib_scores"]['fwd']) |
| 119 | + |
| 120 | + ppms.append(ppm) |
| 121 | + seqlet_tally.append(num_seqlets) |
| 122 | + cwms.append(cwm) |
| 123 | + names.append(metacluster_name + '.' + pattern_name) |
| 124 | + |
| 125 | + modisco_results.close() |
| 126 | + |
| 127 | + res = [] |
| 128 | + |
| 129 | + for i,x in enumerate(ppms): |
| 130 | + res.append(fetch_tomtom_matches(x, cwms[i], tomtom_exec_path=args.tomtom_exec, motifs_db=args.meme_motif_db, |
| 131 | + n=args.top_n_matches, trim_threshold=args.trim_threshold, trim_min_length=args.trim_min_length)) |
| 132 | + |
| 133 | + # write output. Skipping those patterns which disappear after trimming or have no matches |
| 134 | + with open(args.output_prefix, 'w') as f: |
| 135 | + # write header |
| 136 | + f.write("Pattern") |
| 137 | + f.write("\tNum_Seqlets") |
| 138 | + |
| 139 | + for i in range(args.top_n_matches): |
| 140 | + f.write("\tMatch_{}\tq-value".format(i+1)) |
| 141 | + f.write("\n") |
| 142 | + |
| 143 | + assert len(res) == len(names) |
| 144 | + |
| 145 | + for i,r in enumerate(res): |
| 146 | + f.write(names[i]) |
| 147 | + f.write("\t{}".format(seqlet_tally[i])) |
| 148 | + for match in r: |
| 149 | + f.write("\t{}\t{}".format(match['Target_ID'], match['q-value'])) |
| 150 | + |
| 151 | + # when fewer than n matches are found |
| 152 | + if len(r) != args.top_n_matches: |
| 153 | + f.write("\t\t"*(args.top_n_matches-len(r))) |
| 154 | + f.write("\n") |
| 155 | +if __name__=="__main__": |
| 156 | + main() |
0 commit comments