Skip to content

Commit cc0a94f

Browse files
committed
Add docstrings for EVEX API and allow custom base folder
1 parent 04c1081 commit cc0a94f

File tree

1 file changed

+80
-17
lines changed

1 file changed

+80
-17
lines changed

indra/sources/evex/api.py

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
import logging
44
import pickle
55
import tarfile
6+
from urllib.request import urlretrieve
67
import requests
78
import pandas
8-
import pystow
99
import tqdm
1010

1111
from .processor import EvexProcessor
@@ -17,11 +17,35 @@
1717
standoff_root = 'http://evexdb.org/download/standoff-annotation/version-0.1/'
1818

1919

20-
def process_human_events():
21-
"""Process all human events available in EVEX."""
20+
def process_human_events(base_folder=None):
21+
"""Process all human events available in EVEX.
22+
23+
Note that unless the standoff files have already been downloaded using the
24+
`download_evex` function, the Statements produced by this function
25+
will not carry any evidence text, agent text and various other metadata
26+
in them for which the standoff files are required.
27+
28+
Parameters
29+
----------
30+
base_folder : Optional[str]
31+
If provided, the given base folder is used to download the human
32+
network file from EVEX. Otherwise, the `pystow` package is used
33+
to create an `evex` folder within the pystow base path,
34+
typically ~/.data/evex.
35+
36+
Returns
37+
-------
38+
EvexProcessor
39+
An EvexProcessor instance with the extracted INDRA Statements
40+
as its statements attribute.
41+
"""
42+
if not base_folder:
43+
import pystow
44+
base_folder = pystow.join('evex').as_posix()
2245
standoff_index = build_standoff_index()
23-
network_file = pystow.ensure('evex', name='Homo_sapiens.tar.gz',
24-
url=human_network)
46+
network_file = os.path.join(base_folder, 'Homo_sapiens.tar.gz')
47+
if not os.path.exists(network_file):
48+
urlretrieve(human_network, network_file)
2549
with tarfile.open(network_file, 'r:gz') as fh:
2650
relations_file = fh.extractfile('EVEX_relations_9606.tab')
2751
articles_file = fh.extractfile('EVEX_articles_9606.tab')
@@ -32,16 +56,35 @@ def process_human_events():
3256
return ep
3357

3458

35-
def build_standoff_index(cached=True):
36-
"""Build an index of publications in standoff bulk archive files."""
37-
cache_file = pystow.join('evex', name='standoff_index.pkl')
38-
if cached and cache_file.exists():
39-
logger.info('Loading standoff index from %s' % cache_file.as_posix())
59+
def build_standoff_index(cached=True, base_folder=None):
60+
"""Build an index of publications in standoff bulk archive files.
61+
62+
This index is necessary to figure out which standoff archive the annotations
63+
for a given article are in.
64+
65+
Parameters
66+
----------
67+
cached: Optional[bool]
68+
If True, the standoff index is cached in the base folder and isn't
69+
regenerated if this function is called again, just reloaded.
70+
This is useful since generating the full standoff file index
71+
can take a long time. Default: True
72+
base_folder : Optional[str]
73+
If provided, the given base folder is used to find the standoff
74+
archive files and cache the index. Otherwise, the `pystow` package is used
75+
to create an `evex` folder within the pystow base path,
76+
typically ~/.data/evex.
77+
"""
78+
if not base_folder:
79+
import pystow
80+
base_folder = pystow.join('evex').as_posix()
81+
cache_file = os.path.join(base_folder, 'standoff_index.pkl')
82+
if cached and os.path.exists(cache_file):
83+
logger.info('Loading standoff index from %s' % cache_file)
4084
with open(cache_file, 'rb') as fh:
4185
return pickle.load(fh)
4286
index = {}
43-
for fname in tqdm.tqdm(glob.glob(os.path.join(
44-
pystow.join('evex').as_posix(), 'batch*')),
87+
for fname in tqdm.tqdm(glob.glob(os.path.join(base_folder, 'batch*')),
4588
desc='Building standoff index'):
4689
try:
4790
with tarfile.open(fname, 'r:gz') as fh:
@@ -59,11 +102,30 @@ def build_standoff_index(cached=True):
59102
return index
60103

61104

62-
def download_evex():
63-
"""Download EVEX standoff output."""
105+
def download_evex(base_folder=None):
106+
"""Download EVEX human network and standoff output files.
107+
108+
This function downloads the human network file as well as a large number
109+
of standoff output files. These files are necessary to find evidence text,
110+
agent text and agent coordinates to be used in INDRA. Note that there
111+
are over 4 thousand such files, and the overall size is around 6 GB.
112+
113+
Parameters
114+
----------
115+
base_folder : Optional[str]
116+
If provided, the given base folder is used to download the human
117+
network and standoff files from EVEX. Otherwise, the `pystow` package is used
118+
to create an `evex` folder within the pystow base path,
119+
typically ~/.data/evex.
120+
"""
64121
from bs4 import BeautifulSoup
122+
if not base_folder:
123+
import pystow
124+
base_folder = pystow.join('evex').as_posix()
65125
# Download human network first
66-
pystow.ensure('evex', name='Homo_sapiens.tar.gz', url=human_network)
126+
fname = os.path.join(base_folder, 'Homo_sapiens.tar.gz')
127+
if not os.path.exists(fname):
128+
urlretrieve(human_network, fname)
67129
# Now download all the standoff files
68130
res = requests.get(standoff_root)
69131
soup = BeautifulSoup(res.text, 'html.parser')
@@ -77,5 +139,6 @@ def download_evex():
77139
for node in soup.find_all('a')
78140
if node.get('href').startswith('batch')]
79141
for downloadable in downloadables:
80-
fname = downloadable.split('/')[-1]
81-
pystow.ensure('evex', name=fname, url=downloadable)
142+
fname = os.path.join(base_folder, downloadable.split('/')[-1])
143+
if not os.path.exists(fname):
144+
urlretrieve(downloadable, fname)

0 commit comments

Comments
 (0)