|
| 1 | +from api.utils.bar_utils import BARUtils |
| 2 | + |
| 3 | +import tempfile |
| 4 | +import os |
| 5 | +import subprocess |
| 6 | +from collections import defaultdict |
| 7 | + |
| 8 | + |
class MfinderUtils:
    """Static helpers for validating requests, running mFinder and formatting its output."""

    @staticmethod
    def uniq_with(arr, comp_func):
        """Return the items of ``arr`` with duplicates removed (first occurrence kept).

        Two items are duplicates when ``comp_func(item, kept_item)`` is truthy.
        A comparator (rather than a set) is used so unhashable items such as
        lists can be de-duplicated.
        """
        unique_arr = []
        for item in arr:
            if not any(comp_func(item, unique_item) for unique_item in unique_arr):
                unique_arr.append(item)
        return unique_arr

    @staticmethod
    def is_equal(a, b):
        """Return ``a == b``; the default comparator passed to ``uniq_with``."""
        return a == b

    @staticmethod
    def find_key(d, value):
        """Return the first key of ``d`` whose value equals ``value``.

        Raises:
            StopIteration: if no value in ``d`` equals ``value``.
        """
        return next(key for key, val in d.items() if val == value)

    @staticmethod
    def input_validation(input):
        """Validate the JSON body as an array of gene-ID pairs (edges).

        Expected shape: ``[["AT1G01010", "AT5G01010"], ["AT3G10000", "AT2G03240"]]``.

        Returns:
            An error message (``str``) describing the first problem found, or,
            when the input is valid, the list of edges with self-edges and
            duplicate edges removed (mFinder does not accept them).
        """
        if not isinstance(input, list):
            return "invalid JSON, not an arr"

        if not input:
            return "arr length 0!"

        # Type-check the members BEFORE calling len() on them; otherwise a
        # non-sized member (e.g. a number) raises TypeError instead of
        # producing a validation message.
        if not all(isinstance(i, list) for i in input):
            return "invalid JSON, check arr members are arrs!"

        if any(len(i) != 2 for i in input):
            return "inner arr length is not of length 2!"

        if not all(isinstance(j, str) for i in input for j in i):
            return "invalid JSON, check if inside arr members are strings!"

        if not all(BARUtils.is_arabidopsis_gene_valid(j) for i in input for j in i):
            return "Invalid gene ID contained!"

        # filter self-edges and duplicate edges (mFinder does not accept)
        return MfinderUtils.uniq_with([i for i in input if i[0] != i[1]], MfinderUtils.is_equal)

    @staticmethod
    def _is_int(value):
        """Return True for real integers only (``bool`` is an ``int`` subclass, so exclude it)."""
        return isinstance(value, int) and not isinstance(value, bool)

    @staticmethod
    def settings_validation(opts):
        """Validate the mFinder settings object and fill in defaults.

        Recognised keys and the defaults applied when a key is absent:
        ``nd`` (non-directed network, ``False``), ``r`` (number of random
        networks, ``50``), ``s`` (motif size, ``3``), ``u`` (uniqueness
        minimum, ``4``) and ``z`` (z-score minimum, ``2``). Upper bounds are
        enforced to keep the server load reasonable.

        Args:
            opts: the JSON settings object; may be empty or ``None``.

        Returns:
            A new ``dict`` with the validated settings on success, or a
            ``(message, 400)`` tuple describing the first invalid setting.
        """
        opts = opts or {}
        if not isinstance(opts, dict):
            return "invalid settings - is it an object?", 400

        # Propagate the injection check result; it returns an error tuple
        # (or None when everything is fine) rather than raising.
        injection_error = MfinderUtils.injection_check(opts)
        if injection_error is not None:
            return injection_error

        settings_obj = opts.copy()
        is_int = MfinderUtils._is_int

        if "nd" not in opts:
            settings_obj["nd"] = False
        elif not isinstance(opts["nd"], bool):
            return "incorrect nd setting - is it boolean?", 400

        if "r" not in opts:
            settings_obj["r"] = 50
        elif not is_int(opts["r"]) or opts["r"] > 150:
            return "incorrect r setting - is it a number under 151?", 400

        if "s" not in opts:
            settings_obj["s"] = 3
        elif not is_int(opts["s"]) or opts["s"] < 2 or opts["s"] > 4:
            return "incorrect s setting - is it a number between 2 and 4?", 400

        if "u" not in opts:
            settings_obj["u"] = 4
        elif not is_int(opts["u"]) or opts["u"] > 999:
            return "incorrect u setting - is it a number or below 1000?", 400

        if "z" not in opts:
            settings_obj["z"] = 2
        elif not is_int(opts["z"]) or opts["z"] > 99:
            return "incorrect z setting - is it a number or below 100?", 400

        return settings_obj

    @staticmethod
    def injection_check(obj):
        """Reject suspiciously long setting values.

        Args:
            obj: mapping of setting name to value.

        Returns:
            ``None`` when every value's string form is at most 10 characters,
            otherwise a ``(message, 400)`` tuple naming the first offending key.
        """
        for key, value in obj.items():
            if len(str(value)) > 10:
                return f"{key} settings param is too long", 400
        return None

    @staticmethod
    def create_files_and_mfinder(input, opts_obj):
        """Run mFinder synchronously on the given edges and return its raw output.

        The edges are written to a temporary file, ``/bartmp/mfinder`` is run
        on it, and the ``_OUT.txt`` and ``_MEMBERS.txt`` files written next to
        the input file are read back. All three files are removed afterwards,
        including when mFinder fails.

        Args:
            input: filtered list of gene-ID pairs (edges).
            opts_obj: validated settings with keys ``s``, ``r``, ``u``, ``z``
                and optionally ``nd``.

        Returns:
            ``{"hashOfIds": ..., "mFinderStats": ..., "mFinderMembers": ...}``.

        Raises:
            subprocess.CalledProcessError: if mFinder exits with a non-zero status.
            OSError: if the expected output files cannot be read.
        """
        # get a hash of IDs -> numbers for later lookup and writable string
        hash_of_ids, return_str = MfinderUtils.get_gene_id_hash_map(input)

        tmpfile = tempfile.NamedTemporaryFile(mode="w+", suffix=".txt", delete=False)
        base_path = tmpfile.name[:-4]  # strip the ".txt" suffix
        stats_path = base_path + "_OUT.txt"
        members_path = base_path + "_MEMBERS.txt"

        try:
            # Closing the file (end of the with-block) guarantees the edge list
            # is fully written before mFinder reads it.
            with tmpfile:
                # give read/write permissions to user but nada to anybody else
                os.chmod(tmpfile.name, 0o600)
                tmpfile.write(return_str)

            # Argument list + no shell: values are passed verbatim to mFinder
            # and can never be interpreted by a shell.
            command = [
                "/bartmp/mfinder",
                tmpfile.name,
                "-s", str(opts_obj["s"]),
                "-r", str(opts_obj["r"]),
                "-u", str(opts_obj["u"]),
                "-z", str(opts_obj["z"]),
            ]
            if opts_obj.get("nd"):
                command.append("-nd")
            command.append("-omem")
            subprocess.run(command, check=True)

            with open(stats_path, "r") as stats_file:
                mfinder_stats = stats_file.read()

            with open(members_path, "r") as members_file:
                mfinder_members = members_file.read()
        finally:
            # Always clean up, even when mFinder failed part-way through.
            for path in (tmpfile.name, stats_path, members_path):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass

        return {"hashOfIds": hash_of_ids, "mFinderStats": mfinder_stats, "mFinderMembers": mfinder_members}

    @staticmethod
    def get_gene_id_hash_map(input):
        """Number the distinct gene IDs and serialise the edges for mFinder.

        IDs are numbered from 1 in order of first appearance. Each edge
        becomes one line of the form ``<source number> <target number> 1``.

        Args:
            input: list of ``[source_id, target_id]`` pairs of strings.

        Returns:
            ``(hash_of_ids, return_str)`` where ``hash_of_ids`` maps number to
            gene ID (a ``defaultdict`` yielding ``None`` for unknown numbers)
            and ``return_str`` is the newline-terminated edge list.
        """
        hash_of_ids = defaultdict(lambda: None)
        # Reverse index (gene ID -> number) so each lookup is O(1) instead of
        # scanning every value of hash_of_ids for every edge.
        number_of_id = {}
        lines = []
        for source, target in ((item[0], item[1]) for item in input):
            for gene_id in (source, target):
                if gene_id not in number_of_id:
                    number = len(number_of_id) + 1
                    number_of_id[gene_id] = number
                    hash_of_ids[number] = gene_id
            lines.append(f"{number_of_id[source]} {number_of_id[target]} 1\n")

        return hash_of_ids, "".join(lines)

    @staticmethod
    def beautify_results(mfinder_res_obj):
        """Parse the raw mFinder stats and members text into a structured result.

        Args:
            mfinder_res_obj: dict with keys ``mFinderStats``, ``mFinderMembers``
                and ``hashOfIds`` (as returned by ``create_files_and_mfinder``).

        Returns:
            The value of ``BARUtils.success_exit`` applied to
            ``{"sigMotifs": {...}, "motifList": {...}}``.

        Raises:
            ValueError: if the ``[MILI]`` delimiter is missing from the stats text.
        """
        stats = mfinder_res_obj["mFinderStats"]
        mems = mfinder_res_obj["mFinderMembers"]
        id_map = mfinder_res_obj["hashOfIds"]
        ret_obj = {"sigMotifs": {}, "motifList": {}}

        try:
            sig_motifs_str = stats.split("[MILI]\t\n\n")[1].split("Full")[0].split("\n\n")
        # In case stats has less than 2 parts after split('[MILI]\t\n\n')[1]
        except IndexError as err:
            raise ValueError("Expected delimiter '[MILI]\t\n\n' or 'Full' not found in the stats string.") from err

        # Keep every second chunk and drop the last two chunks.
        # NOTE(review): presumably this skips blank separators and a trailer in
        # the mFinder stats layout - confirm against a real output file.
        for item in sig_motifs_str[:-2:2]:
            split_stats_for_motif_id = item.split("\t")
            ret_obj["sigMotifs"][split_stats_for_motif_id[0]] = {
                "numAppearances": split_stats_for_motif_id[1],
                "numAppearancesRand": split_stats_for_motif_id[2],
                "appearancesZScore": split_stats_for_motif_id[3],
                "pValue": split_stats_for_motif_id[4],
                "uniq": split_stats_for_motif_id[5],
                "conc": split_stats_for_motif_id[6],
            }

        subgraphs_list_str = mems.split("subgraph id = ")[1:]
        for subgraph_str in subgraphs_list_str:
            member_list_split = subgraph_str.split("\n")
            motif_mem_results = []
            # Lines 5..-2 of each subgraph section hold the member rows; the
            # first line holds the subgraph id.
            for member_line in member_list_split[5:-2]:
                node_numbers = member_line.rstrip("\t").split("\t")
                # Translate every node number on the row (the motif size "s"
                # may be 2, 3 or 4, so do not assume exactly three genes).
                motif_mem_results.append(
                    "\t".join(str(id_map[int(number)]) for number in node_numbers)
                )  # i.e. PAT\tPAN\tEGFR
            ret_obj["motifList"][member_list_split[0]] = motif_mem_results

        return BARUtils.success_exit(ret_obj)
0 commit comments