Skip to content

Commit 57a0a43

Browse files
committed
Reduce batch size for API queries, as it was dying with incomplete data
1 parent 6d037a0 commit 57a0a43

File tree

1 file changed

+14
-9
lines changed

1 file changed

+14
-9
lines changed

generate_transcript_data/cdot_gene_info.py

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -37,14 +37,18 @@ def batch_iterator(iterable: Iterable[T], batch_size: int = 10) -> Iterator[List
37 37

38 38

39 39
def _get_entrez_gene_summary(id_list):
40-
request = Entrez.epost("gene", id=",".join(id_list))
41-
result = Entrez.read(request)
42-
web_env = result["WebEnv"]
43-
query_key = result["QueryKey"]
44-
data = Entrez.esummary(db="gene", webenv=web_env, query_key=query_key)
45-
document = Entrez.read(data, ignore_errors=True, validate=False) # Need recent BioPython
46-
return document["DocumentSummarySet"]["DocumentSummary"]
47-
40+
for _ in range(3):
41+
try:
42+
request = Entrez.epost("gene", id=",".join(id_list))
43+
result = Entrez.read(request)
44+
web_env = result["WebEnv"]
45+
query_key = result["QueryKey"]
46+
data = Entrez.esummary(db="gene", webenv=web_env, query_key=query_key)
47+
document = Entrez.read(data, ignore_errors=True, validate=False) # Need recent BioPython
48+
return document["DocumentSummarySet"]["DocumentSummary"]
49+
except Exception as e:
50+
logging.warning(e)
51+
logging.warning("Trying again...")
48 52

49 53
def iter_entrez_ids(reader):
50 54
for gi in reader:
@@ -57,7 +61,8 @@ def main():
57 61
start_date = datetime.now().isoformat()
58 62

59 63
# 10k limit of return data from NCBI
60-
NCBI_BATCH_SIZE = 10000
64+
# NCBI_BATCH_SIZE = 10000
65+
NCBI_BATCH_SIZE = 1000
61 66

62 67
gene_info = {}
63 68
with gzip.open(args.gene_info, "rt") as f:

0 commit comments

Comments
 (0)