33import logging
44import pickle
55import tarfile
6+ from urllib .request import urlretrieve
67import requests
78import pandas
8- import pystow
99import tqdm
1010
1111from .processor import EvexProcessor
1717standoff_root = 'http://evexdb.org/download/standoff-annotation/version-0.1/'
1818
1919
20- def process_human_events ():
21- """Process all human events available in EVEX."""
20+ def process_human_events (base_folder = None ):
21+ """Process all human events available in EVEX.
22+
23+ Note that unless the standoff files have already been downloaded using the
24+ `download_evex` function, the Statements produced by this function
25+ will not carry any evidence text, agent text and various other metadata
26+ in them for which the standoff files are required.
27+
28+ Parameters
29+ ----------
30+ base_folder : Optional[str]
31+ If provided, the given base folder is used to download the human
32+ network file from EVEX. Otherwise, the `pystow` package is used
33+ to create an `evex` folder within the pystow base path,
34+ typically ~/.data/evex.
35+
36+ Returns
37+ -------
38+ EvexProcessor
39+ An EvexProcessor instance with the extracted INDRA Statements
40+ as its statements attribute.
41+ """
42+ if not base_folder :
43+ import pystow
44+ base_folder = pystow .join ('evex' ).as_posix ()
2245 standoff_index = build_standoff_index ()
23- network_file = pystow .ensure ('evex' , name = 'Homo_sapiens.tar.gz' ,
24- url = human_network )
46+ network_file = os .path .join (base_folder , 'Homo_sapiens.tar.gz' )
47+ if not os .path .exists (network_file ):
48+ urlretrieve (human_network , network_file )
2549 with tarfile .open (network_file , 'r:gz' ) as fh :
2650 relations_file = fh .extractfile ('EVEX_relations_9606.tab' )
2751 articles_file = fh .extractfile ('EVEX_articles_9606.tab' )
@@ -32,16 +56,35 @@ def process_human_events():
3256 return ep
3357
3458
35- def build_standoff_index (cached = True ):
36- """Build an index of publications in standoff bulk archive files."""
37- cache_file = pystow .join ('evex' , name = 'standoff_index.pkl' )
38- if cached and cache_file .exists ():
39- logger .info ('Loading standoff index from %s' % cache_file .as_posix ())
59+ def build_standoff_index (cached = True , base_folder = None ):
60+ """Build an index of publications in standoff bulk archive files.
61+
62+ This index is necessary to figure out which standoff archive the annotations
63+ for a given article are in.
64+
65+ Parameters
66+ ----------
67+ cached: Optional[bool]
68+ If True, the standoff index is cached in the base folder and isn't
69+ regenerated if this function is called again, just reloaded.
70+ This is useful since generating the full standoff file index
71+ can take a long time. Default: True
72+ base_folder : Optional[str]
73+ If provided, the given base folder is searched for the standoff batch
74+ archives and holds the cached index file. Otherwise, the `pystow` package is used
75+ to create an `evex` folder within the pystow base path,
76+ typically ~/.data/evex.
77+ """
78+ if not base_folder :
79+ import pystow
80+ base_folder = pystow .join ('evex' ).as_posix ()
81+ cache_file = os .path .join (base_folder , 'standoff_index.pkl' )
82+ if cached and os .path .exists (cache_file ):
83+ logger .info ('Loading standoff index from %s' % cache_file )
4084 with open (cache_file , 'rb' ) as fh :
4185 return pickle .load (fh )
4286 index = {}
43- for fname in tqdm .tqdm (glob .glob (os .path .join (
44- pystow .join ('evex' ).as_posix (), 'batch*' )),
87+ for fname in tqdm .tqdm (glob .glob (os .path .join (base_folder , 'batch*' )),
4588 desc = 'Building standoff index' ):
4689 try :
4790 with tarfile .open (fname , 'r:gz' ) as fh :
@@ -59,11 +102,30 @@ def build_standoff_index(cached=True):
59102 return index
60103
61104
62- def download_evex ():
63- """Download EVEX standoff output."""
105+ def download_evex (base_folder = None ):
106+ """Download EVEX human network and standoff output files.
107+
108+ This function downloads the human network file as well as a large number
109+ of standoff output files. These files are necessary to find evidence text,
110+ agent text and agent coordinates to be used in INDRA. Note that there
111+ are over 4 thousand such files, and the overall size is around 6 GB.
112+
113+ Parameters
114+ ----------
115+ base_folder : Optional[str]
116+ If provided, the given base folder is used to download the human
117+ network file and the standoff files from EVEX. Otherwise, the `pystow` package is used
118+ to create an `evex` folder within the pystow base path,
119+ typically ~/.data/evex.
120+ """
64121 from bs4 import BeautifulSoup
122+ if not base_folder :
123+ import pystow
124+ base_folder = pystow .join ('evex' ).as_posix ()
65125 # Download human network first
66- pystow .ensure ('evex' , name = 'Homo_sapiens.tar.gz' , url = human_network )
126+ fname = os .path .join (base_folder , 'Homo_sapiens.tar.gz' )
127+ if not os .path .exists (fname ):
128+ urlretrieve (human_network , fname )
67129 # Now download all the standoff files
68130 res = requests .get (standoff_root )
69131 soup = BeautifulSoup (res .text , 'html.parser' )
@@ -77,5 +139,6 @@ def download_evex():
77139 for node in soup .find_all ('a' )
78140 if node .get ('href' ).startswith ('batch' )]
79141 for downloadable in downloadables :
80- fname = downloadable .split ('/' )[- 1 ]
81- pystow .ensure ('evex' , name = fname , url = downloadable )
142+ fname = os .path .join (base_folder , downloadable .split ('/' )[- 1 ])
143+ if not os .path .exists (fname ):
144+ urlretrieve (downloadable , fname )
0 commit comments