Skip to content

Commit 0162c68

Browse files
committed
Works around NCBI's limit of 10000 accession IDs
1 parent da58340 commit 0162c68

File tree

1 file changed

+12
-13
lines changed

1 file changed

+12
-13
lines changed

back_end.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,28 +34,33 @@ def __init__(self, database, term, outfile, gui):
3434
self.terminated = False
3535
super(Downloader, self).__init__()
3636

37-
3837
def ncbi_search(self, database, term):
3938
"""
4039
Submit search to NCBI and return the records.
4140
"""
4241
self.handle = Entrez.esearch(db=database, term=term, usehistory="y",
43-
retmax=100000000, idtype="acc")
42+
retmax=10, idtype="acc")
4443
self.record = Entrez.read(self.handle)
4544
self.handle.close()
4645

4746
return self.record
4847

49-
50-
def record_processor(self, record):
48+
def record_processor(self, record, database):
5149
"""
5250
Splits the record returned by Entrez into separate variables and returns
5351
them.
5452
"""
5553
count = int(record["Count"]) # Int
56-
IDs = record["IdList"] # List
5754
webenv = record["WebEnv"] # String
5855
query_key = record["QueryKey"] # String
56+
IDs = []
57+
58+
for i in range(0, count, 10000):
59+
iter_handle = Entrez.efetch(db=database, webenv=webenv,
60+
query_key=query_key, retmax=10000,
61+
rettype="acc", retstart=i)
62+
IDs += [x.rstrip() for x in iter_handle]
63+
iter_handle.close()
5964

6065
assert count == len(IDs)
6166

@@ -68,7 +73,6 @@ def record_processor(self, record):
6873

6974
return count, IDs, webenv, query_key
7075

71-
7276
def main_organizer(self, count, IDs, webenv, query_key, b_size, Run):
7377
"""
7478
Defines what tasks need to be performed, handles NCBI server errors and
@@ -132,7 +136,6 @@ def main_organizer(self, count, IDs, webenv, query_key, b_size, Run):
132136
if self.terminated is False:
133137
self.re_downloader(IDs, webenv, query_key, b_size)
134138

135-
136139
def re_downloader(self, IDs, webenv, query_key, b_size):
137140
"""
138141
Checks for missing sequences.
@@ -162,7 +165,6 @@ def re_downloader(self, IDs, webenv, query_key, b_size):
162165
self.main_organizer(numb_missing, IDs, webenv, query_key,
163166
b_size, 2)
164167

165-
166168
def error_finder(self, target_file):
167169
"""
168170
Looks for errors in the output fasta and returns a list of necessary
@@ -179,7 +181,6 @@ def error_finder(self, target_file):
179181
target_handle.close()
180182
return verified_ids
181183

182-
183184
def fetch_by_id(self, IDs, b_size):
184185
"""
185186
Fetches NCBI data based on the IDs, rather than a search query. Returns
@@ -195,7 +196,6 @@ def fetch_by_id(self, IDs, b_size):
195196

196197
return data
197198

198-
199199
def fetch_by_history(self, start, b_size, webenv, query_key):
200200
"""
201201
Fetches NCBI data based on the provided search query. Returns the data
@@ -213,7 +213,6 @@ def fetch_by_history(self, start, b_size, webenv, query_key):
213213

214214
return data
215215

216-
217216
def translate_genome(self, acclist):
218217
"""
219218
Translates genome query IDs into nucleotide query IDs, since NCBI has
@@ -236,7 +235,6 @@ def translate_genome(self, acclist):
236235

237236
return nuc_acc_list
238237

239-
240238
def run_everything(self):
241239
"""
242240
Run the functions in order.
@@ -248,7 +246,8 @@ def run_everything(self):
248246

249247
rec = self.ncbi_search(self.database, self.term)
250248
try:
251-
count, IDs, webenv, query_key = self.record_processor(rec)
249+
count, IDs, webenv, query_key = self.record_processor(rec,
250+
self.database)
252251
except TypeError:
253252
return None
254253
if self.database == "genome":

0 commit comments

Comments
 (0)