Skip to content

Commit efd469c

Browse files
committed
Merge branch 'master' of github.com:MG-RAST/MG-RAST-Tools
2 parents 9f17a4b + a2cfd8c commit efd469c

24 files changed

+475
-364
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ Repository of scripts and libraries for using the MG-RAST API and MG-RAST data
1919

2020
- Python libs
2121
- prettytable - <https://pypi.python.org/pypi/PrettyTable>
22-
- poster - <https://pypi.python.org/pypi/poster>
23-
For python3, you will need a python3 port of poster: <https://github.com/mrd1no/poster-0.8.1-for-Python-3.4>
2422
- requests - <http://docs.python-requests.org/en/latest>
2523
- requests_toolbelt - <https://github.com/sigmavirus24/requests-toolbelt>
2624
- scipy - <http://www.scipy.org>

examples/python/abundance_matrix.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,20 @@
44

55
from __future__ import print_function
66
import sys
7+
import json
78
from optparse import OptionParser
89
from mglib import async_rest_api, sparse_to_dense, get_auth_token, API_URL
910

1011
DEBUG = 0
1112

1213
if __name__ == '__main__':
13-
usage = "usage: %prog -i <input sequence file> -o <output file>"
14+
usage = "Usage: %prog [options]\nFunction: retrieves data from MG-RAST matrix API, unwraps into csv format"
1415
parser = OptionParser(usage)
1516
parser.add_option("-s", "--source", dest="source", default="RefSeq", help="Annotation source: RefSeq, GenBank, IMG, SEED, TrEMBL, SwissProt, PATRIC, KEG, RDP, Greengenes, LSU, SSU")
1617
parser.add_option("-g", "--grouplevel", dest="grouplevel", default="domain", help="Grouping level: strain, species, genus, family, order, class, phylum, domain / function, level1, level2, level3")
1718
parser.add_option("-i", "--hittype", dest="hittype", default="single", help="Hit type: all, single, lca")
1819
parser.add_option("-c", "--call", dest="call", default="organism", help="organism or function")
20+
parser.add_option("-b", "--biom", dest="biom", action="store_true", help="biom output (csv default)")
1921
parser.add_option("-d", "--identity", dest="identity", default=1, help="% identity threshold")
2022
parser.add_option("-e", "--evalue", dest="evalue", default="1", help="organism or function")
2123
parser.add_option("-t", "--type", dest="resulttype", default="abundance", help="Result type: abundance, evalue, identity, or length")
@@ -36,35 +38,40 @@
3638
length = opts.length
3739
identity = opts.identity
3840
hittype = opts.hittype
41+
biom = opts.biom
3942
# construct API call
4043
base_url = API_URL + "/matrix/organism"
4144
if opts.call == "function" or opts.source == "SubSystems":
4245
base_url = API_URL + "/matrix/function"
43-
base_url = base_url + "?asynchronous=1&group_level=%s&result_type=%s&auth=%s&source=%s&evalue=%s&length=%s&identity=%s&hittype=%s&" % (group_level, result_type, key, source, evalue, length, identity, hittype)
46+
base_url = base_url + "?asynchronous=1&group_level=%s&result_type=%s&source=%s&evalue=%s&length=%s&identity=%s&hittype=%s&" % (group_level, result_type, source, evalue, length, identity, hittype)
4447
URI = base_url + "&".join(["id=%s" % m for m in metagenomes.split(",")])
4548
print(URI, file=sys.stderr)
46-
print("#"+ URI, file=sys.stdout)
4749
# retrieve the data by sending an HTTP GET request to the MG-RAST API
4850

4951
jsonstructure = async_rest_api(URI, auth=key)
52+
if biom:
53+
print(json.dumps(jsonstructure))
54+
sys.exit()
55+
else:
56+
print("#"+ URI, file=sys.stdout)
5057

51-
# unpack and display the data table
52-
cols = [x["id"] for x in jsonstructure["columns"]]
53-
rows = [x["id"] for x in jsonstructure["rows"]]
54-
matrixtype = jsonstructure["type"]
55-
58+
matrixtype = jsonstructure["data"]["matrix_type"]
5659
if matrixtype == "sparse":
5760
data = sparse_to_dense(jsonstructure["data"], len(rows), len(cols))
5861
else:
5962
data = jsonstructure["data"]
63+
# unpack and display the data table
64+
cols = [x["id"] for x in data["columns"]]
65+
rows = [x["id"] for x in data["rows"]]
66+
6067

6168
if DEBUG:
6269
print(jsonstructure)
6370
print("COLS", cols)
6471
print("ROWS", rows)
6572
print("TYPE", matrixtype)
6673
print("DATA", data)
67-
h = data
74+
h = data["data"]
6875

6976
sys.stdout.write("Taxon\t")
7077
for j in range(0, len(cols)):

examples/python/annotation_table.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@
99

1010
DEBUG = 0
1111

12-
1312
if __name__ == '__main__':
14-
usage = "usage: %prog -i <input sequence file> -o <output file>"
13+
usage = "usage: %prog [options]\nFunction: retrieves and presents table of sequence IDs and annotation table results"
1514
parser = OptionParser(usage)
1615
parser.add_option("-s", "--source", dest="source", default="RefSeq", help="Annotation source: RefSeq, GenBank, IMG, SEED, TrEMBL, SwissProt, PATRIC, KEG, RDP, Greengenes, LSU, SSU")
1716
parser.add_option("-g", "--grouplevel", dest="grouplevel", default="domain", help="Grouping level: strain, species, genus, family, order, class, phylum, domain / function, level1, level2, level3")
@@ -36,7 +35,7 @@
3635

3736
# construct API call
3837
base_url = API_URL + "/profile/{}".format(metagenomes)
39-
base_url = base_url + "?asynchronous=1&group_level=%s&result_type=%s&auth=%s&source=%s&evalue=%s&" % (group_level, result_type, key, source, evalue)
38+
base_url = base_url + "?asynchronous=1&group_level=%s&result_type=%s&source=%s&evalue=%s&" % (group_level, result_type, source, evalue)
4039
URI = base_url + "&".join(["id=%s" % m for m in metagenomes.split(",")])
4140
URI = base_url
4241
print(URI, file=sys.stderr)
@@ -48,6 +47,7 @@
4847
# rows = [x["id"] for x in jsondata["rows"]]
4948

5049
data = jsondata # ["data"]
50+
data = jsondata["data"]
5151

5252
if DEBUG:
5353
print(jsonstructure)

examples/python/list_all_mg.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,30 @@
1-
#!/usr/bin/env python2
1+
#!/usr/bin/env python
22
'''This script retrieves a list of metagenomes from the MG-RAST API.'''
33
from __future__ import print_function
4-
import urllib
4+
from __future__ import unicode_literals
55
import sys
6+
import time
67

7-
from mglib import get_auth_token, obj_from_url, API_URL
8+
from mglib import get_auth_token, obj_from_url, API_URL, urlencode
89

910
def printlist(js):
1011
'''prints essential fields from metagenome list'''
1112
for item in js["data"]:
1213
if "public" in item.keys():
13-
public = repr(item["public"])
14+
public = item["public"]
1415
else:
1516
public = "False"
16-
sys.stdout.write( ("\t".join([item["metagenome_id"],
17-
# str(len(item.keys())),
18-
public, item["created_on"],
19-
item["name"]]) + "\n").encode("utf-8"))
17+
try:
18+
mg_name= item["name"]
19+
project_id = item["project_id"]
20+
project_name = item["project_name"]
21+
except KeyError:
22+
sys.stderr.write(repr(item) + "\n")
23+
sys.stdout.write(("\t".join([item["metagenome_id"],
24+
# str(len(item.keys())),
25+
repr(public), item["created_on"],
26+
mg_name, project_id, project_name]) + "\n"))
2027

21-
CALL = "/metagenome"
2228
CALL = "/search"
2329

2430
key = get_auth_token()
@@ -27,23 +33,30 @@ def printlist(js):
2733
limit = 1000 # initial call
2834

2935
# construct API call
36+
# public = 0 means "don't show public metagenomes"
37+
parameters = {"limit": limit, "order":"created_on", "direction": "asc", "public": "1"}
38+
API_URL= "https://api.mg-rast.org/"
3039

31-
parameters = {"limit": limit, "auth": key, "order":"created_on", "direction": "asc"}
32-
base_url = API_URL + CALL + "?" + urllib.urlencode(parameters)
40+
base_url = API_URL + CALL + "?" + urlencode(parameters)
3341

3442
# convert the data from a JSON structure to a python data type, a dict of dicts.
35-
jsonstructure = obj_from_url(base_url)
43+
jsonstructure = obj_from_url(base_url, auth=key)
3644

3745
# unpack and display the data table
3846
total_count = int(jsonstructure["total_count"])
3947
sys.stderr.write("Total number of records: {:d}\n".format(total_count))
4048

41-
for i in range(0, total_count / limit +1):
42-
sys.stderr.write("Page {:d}\t".format(i))
43-
jsonstructure = obj_from_url(base_url)
49+
for i in range(0, int(total_count / limit) +2):
50+
# sys.stderr.write("Page {:d}\t".format(i))
51+
sys.stderr.write("Page {:d}\t{}\n".format(i, base_url))
52+
jsonstructure = obj_from_url(base_url, auth=key)
4453
printlist(jsonstructure)
45-
try:
54+
time.sleep(1)
55+
if "next" in jsonstructure.keys():
4656
next_url = jsonstructure["next"]
4757
base_url = next_url
48-
except KeyError:
58+
continue
59+
else:
60+
sys.stderr.write("No next, page {} url {} \n".format(i, base_url))
61+
sys.stderr.write(repr(jsonstructure))
4962
break

mglib/mglib.py

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,26 @@ def body_from_url(url, accept, auth=None, data=None, debug=False, method=None):
7272

7373
# return python struct from JSON output of MG-RAST or Shock API
7474
def obj_from_url(url, auth=None, data=None, debug=False, method=None):
75-
result = body_from_url(url, 'application/json', auth=auth, data=data, debug=debug, method=method)
76-
obj = json.loads(result.read().decode("utf8"))
75+
if type(data) is str:
76+
data=data.encode("utf8")
77+
try:
78+
result = body_from_url(url, 'application/json', auth=auth, data=data, debug=debug, method=method)
79+
read = result.read()
80+
except: # try one more time ConnectionResetError is incompatible with python2
81+
result = body_from_url(url, 'application/json', auth=auth, data=data, debug=debug, method=method)
82+
read = result.read()
83+
if result.headers["content-type"] == "application/x-download" or result.headers["content-type"] == "application/octet-stream":
84+
return(read) # Watch out!
85+
if result.headers["content-type"][0:9] == "text/html": # json decoder won't work
86+
return(read) # Watch out!
87+
if result.headers["content-type"] == "application/json": # If header is set, this should work
88+
data = read.decode("utf8")
89+
obj = json.loads(data)
90+
else:
91+
data = read.decode("utf8")
92+
obj = json.loads(data)
7793
if obj is None:
78-
sys.stderr.write("ERROR: return structure not valid json format\n")
94+
sys.stderr.write("ERROR: return structure not valid json format\n" + repr(data))
7995
sys.exit(1)
8096
if len(list(obj.keys())) == 0:
8197
if debug:
@@ -86,7 +102,7 @@ def obj_from_url(url, auth=None, data=None, debug=False, method=None):
86102
sys.stderr.write("ERROR: %s\n" %obj['ERROR'])
87103
sys.exit(1)
88104
if ('error' in obj) and obj['error']:
89-
if isinstance(obj['error'], basestring):
105+
if isinstance(obj['error'], str):
90106
sys.stderr.write("ERROR:\n%s\n" %obj['error'])
91107
else:
92108
sys.stderr.write("ERROR: %s\n" %obj['error'][0])
@@ -118,28 +134,44 @@ def async_rest_api(url, auth=None, data=None, debug=False, delay=60):
118134
except:
119135
parameters = {"asynchronous": 1}
120136
submit = obj_from_url(url, auth=auth, data=data, debug=debug)
121-
# If "status" is not present, or if "status" is somehow not "submitted"
137+
    # If "status" is not present, or if "status" is somehow not "submitted"
122138
# assume this is not an asynchronous call and it's done.
123-
if ('status' in submit) and (submit['status'] == 'done') and ('url' in submit):
124-
return submit['data']
125-
if not (('status' in submit) and (submit['status'] == 'submitted') and ('url' in submit)):
139+
if type(submit) == bytes: # can't decode
140+
try:
141+
return decode("utf-8", submit)
142+
except:
143+
return submit
144+
if ('status' in submit) and (submit['status'] != 'submitted') and (submit['status'] != "processing") and ('data' in submit):
126145
return submit
127-
result = obj_from_url(submit['url'], debug=debug)
128-
try:
129-
while result['status'] != 'done':
146+
if not ('url' in submit.keys()):
147+
return submit
148+
# if not (('status' in submit) and (submit['status'] == 'submitted') and ('url' in submit)):
149+
# return submit # No status, no url and no submitted
150+
result = obj_from_url(submit['url'], auth=auth, debug=debug)
151+
if type(result) is bytes:
152+
return(result)
153+
if 'status' in result.keys():
154+
while result['status'] == 'submitted' or result['status'] == "processing":
130155
if debug:
131156
print("waiting %d seconds ..."%delay)
132157
time.sleep(delay)
133-
result = obj_from_url(submit['url'], debug=debug)
134-
except KeyError:
158+
result = obj_from_url(submit['url'], auth=auth, debug=debug)
159+
if 'url' in result.keys() or 'next' in result.keys(): # does not need to wait
160+
return(result)
161+
try:
135162
print("Error in response to "+url, file=sys.stderr)
136-
print("Does not contain 'status' field, likely API syntax error", file=sys.stderr)
137-
print(json.dumps(result), file=sys.stderr)
138-
sys.exit(1)
139-
if 'ERROR' in result['data']:
140-
sys.stderr.write("ERROR: %s\n" %result['data']['ERROR'])
163+
print("Does not contain 'status' or 'next' field, likely API syntax error", file=sys.stderr)
141164
print(json.dumps(result), file=sys.stderr)
142165
sys.exit(1)
166+
except TypeError: # result isn't json, return it anyway
167+
return(result.decode("utf8"))
168+
try:
169+
if 'ERROR' in result['data']:
170+
sys.stderr.write("ERROR: %s\n" %result['data']['ERROR'])
171+
print(json.dumps(result), file=sys.stderr)
172+
sys.exit(1)
173+
except KeyError: # result doesn't have "data"
174+
return result
143175
return result['data']
144176

145177
# POST file to MG-RAST or Shock
@@ -206,14 +238,15 @@ def sparse_to_dense(sMatrix, rmax, cmax):
206238
# transform BIOM format to tabbed table
207239
# returns max value of matrix
208240
def biom_to_tab(biom, hdl, rows=None, use_id=True, col_name=False):
241+
assert 'matrix_type' in biom.keys(), repr(biom)
209242
if biom['matrix_type'] == 'sparse':
210243
matrix = sparse_to_dense(biom['data'], biom['shape'][0], biom['shape'][1])
211244
else:
212245
matrix = biom['data']
213246
if col_name:
214-
hdl.write( "\t%s\n" %"\t".join([c['name'] for c in biom['columns']]) )
247+
hdl.write("\t%s\n" %"\t".join([c['name'] for c in biom['columns']]))
215248
else:
216-
hdl.write( "\t%s\n" %"\t".join([c['id'] for c in biom['columns']]) )
249+
hdl.write("\t%s\n" %"\t".join([c['id'] for c in biom['columns']]))
217250
rowmax = []
218251
for i, row in enumerate(matrix):
219252
name = biom['rows'][i]['id']
@@ -223,7 +256,7 @@ def biom_to_tab(biom, hdl, rows=None, use_id=True, col_name=False):
223256
continue
224257
try:
225258
rowmax.append(max(row))
226-
hdl.write( "%s\t%s\n" %(name, "\t".join(map(str, row))) )
259+
hdl.write("%s\t%s\n" %(name, "\t".join(map(str, row))))
227260
except:
228261
try:
229262
hdl.close()
@@ -254,6 +287,7 @@ def profile_to_matrix(p):
254287
p['matrix_element_type'] = 'int'
255288
p['matrix_element_value'] = 'abundance'
256289
p['date'] = time.strftime("%Y-%m-%d %H:%M:%S")
290+
assert 'matrix_type' in p.keys(), repr(p)
257291
if p['matrix_type'] == 'sparse':
258292
p['data'] = sparse_to_dense(p['data'], p['shape'][0], p['shape'][1])
259293
if trim:
@@ -293,6 +327,7 @@ def merge_biom(b1, b2):
293327
"id": b1['id']+'_'+b2['id'],
294328
"type": b1['type'] }
295329
# make sure we are dense
330+
assert 'matrix_type' in b2.keys(), repr(b2)
296331
if b2['matrix_type'] == 'sparse':
297332
b2['data'] = sparse_to_dense(b2['data'], b2['shape'][0], b2['shape'][1])
298333
# get lists of ids
@@ -344,14 +379,15 @@ def biom_to_matrix(biom, col_name=False, sig_stats=False):
344379
except KeyError:
345380
rows = [r['id'] for r in biom['rows']]
346381
# rows = [";".join(r['metadata']['hierarchy']) for r in biom['rows']]
382+
assert "matrix_type" in biom.keys(), repr(biom)
347383
if biom['matrix_type'] == 'sparse':
348384
data = sparse_to_dense(biom['data'], len(rows), len(cols))
349385
else:
350386
data = biom['data']
351387
if sig_stats and ('significance' in biom['rows'][0]['metadata']) and (len(biom['rows'][0]['metadata']['significance']) > 0):
352-
cols.extend( [s[0] for s in biom['rows'][0]['metadata']['significance']] )
388+
cols.extend([s[0] for s in biom['rows'][0]['metadata']['significance']] )
353389
for i, r in enumerate(biom['rows']):
354-
data[i].extend( [s[1] for s in r['metadata']['significance']] )
390+
data[i].extend([s[1] for s in r['metadata']['significance']] )
355391
return rows, cols, data
356392

357393
# transform tabbed table to matrix in json format
@@ -374,7 +410,7 @@ def sub_matrix(matrix, ncols):
374410
return matrix
375411
sub = list()
376412
for row in matrix:
377-
sub.append( row[:ncols] )
413+
sub.append(row[:ncols] )
378414
return sub
379415

380416
# return KBase id for MG-RAST id
@@ -407,7 +443,7 @@ def kbids_to_mgids(kbids):
407443
def kbid_lookup(ids, reverse=False):
408444
request = 'mg2kb' if reverse else 'kb2mg'
409445
post = json.dumps({'ids': ids}, separators=(',',':'))
410-
data = obj_from_url(API_URL+'/job/'+request, data=post)
446+
data = obj_from_url(API_URL+'/job/'+request, auth=auth, data=post)
411447
return data['data']
412448

413449
def get_auth_token(opts=None):

scripts/mg-abundant-functions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def main(args):
4848
parser.add_argument("--filter_level", dest="filter_level", default=None, help="function level to filter by")
4949
parser.add_argument("--top", dest="top", type=int, default=10, help="display only the top N taxa, default is 10")
5050
parser.add_argument("--evalue", dest="evalue", type=int, default=5, help="negative exponent value for maximum e-value cutoff, default is 5")
51-
parser.add_argument("--identity", dest="identity", type=int, default=60, help="percent value for minimum % identity cutoff, default is 60")
51+
parser.add_argument("--identity", dest="identity", type=int, default=60, help="percent value for minimum %% identity cutoff, default is 60")
5252
parser.add_argument("--length", dest="length", type=int, default=15, help="value for minimum alignment length cutoff, default is 15")
5353
parser.add_argument("--version", type=int, dest="version", default=1, help="M5NR annotation version to use, default is 1")
5454

@@ -94,8 +94,10 @@ def main(args):
9494
data = obj_from_url(url)
9595
level = 'level4' if opts.level == 'function' else opts.level
9696
sub_ann = set(map(lambda x: x[level], data['data']))
97-
97+
biomorig = biom
98+
biom = biomorig["data"]
9899
# sort data
100+
assert "matrix_type" in biom.keys(), repr(biom)
99101
if biom["matrix_type"] == "sparse":
100102
for d in sorted(biom['data'], key=itemgetter(2), reverse=True):
101103
name = biom['rows'][d[0]]['id'] # if opts.source != 'Subsystems' else biom['rows'][d[0]]['metadata']['ontology'][-1]

0 commit comments

Comments
 (0)