Skip to content

Commit a83fb5d

Browse files
committed
Moved reference extraction out of the BratAnnIndexer class.
1 parent 3ae2b05 commit a83fb5d

File tree

1 file changed

+37
-32
lines changed

1 file changed

+37
-32
lines changed

src/parserindexer/brat_ann_indexer.py

Lines changed: 37 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,45 @@
11
from solr import Solr
22
import os, sys
3+
reload(sys)
4+
sys.setdefaultencoding('UTF8') # making UTF8 as default encoding
35
from argparse import ArgumentParser
46
from indexer import parse_lpsc_from_path
57
import re
68
from utils import canonical_name
79

10+
# Functions to perform reference removal (assumes [n] reference style)
11+
# Written by Karanjeet Singh
12+
def extract_references(content):
    """
    Extract numbered references from text (assumes the [N] reference style).

    A reference starts at a ``[N]`` marker and runs, non-greedily and across
    line breaks, up to (but not including) the next ``[`` or one of the
    terminator words ``Acknowledge``, ``Fig``, ``Table``, ``Conclusion`` or
    ``pdf``. A marker that is not followed by any terminator is not
    reported. If the same id occurs more than once, the last one wins.

    :param content: text
    :return: dictionary of references with reference id (N, as int) as key
        and the matched text, starting with its ``[N]`` marker, as value
    """
    # The negated character class already matches line breaks, so the text
    # is scanned as-is. Escaping newlines before matching and unescaping
    # them afterwards would turn a literal backslash-n sequence in the input
    # into a real newline, and the returned value would then no longer be a
    # substring of ``content``.
    pattern = r'(\[([0-9]+)\][^\[]*?)(?=\[|Acknowledge|Fig|Table|Conclusion|pdf)'
    references = {}
    for match in re.finditer(pattern, content):
        # Group 2 is the number inside the leading marker; group 1 is the
        # whole reference text including that marker.
        references[int(match.group(2))] = match.group(1)
    return references
29+
30+
def get_reference_id(reference):
    """
    Extract reference id ([N])

    Only the first ``[N]`` marker (N being one or more digits) found
    anywhere in the text is considered.

    :param reference: Any possible reference
    :return: reference id as an int, or -1 if the text has no [N] marker
    """
    # Raw string: a backslash followed by a bracket is not a valid string
    # escape, and newer Python versions warn about it in ordinary literals.
    match = re.search(r'\[([0-9]+)\]', reference)
    if match is None:
        return -1
    # The capture group holds just the digits between the brackets.
    return int(match.group(1))
41+
42+
843
class BratAnnIndexer():
944
'''
1045
This class reads/parses brat annotations from file system and indexes them
@@ -94,36 +129,6 @@ def extract_excerpt(self, content, ann):
94129
sent_end = len(content)
95130
return content[sent_start:sent_end]
96131

97-
def extract_references(self, content):
98-
"""
99-
Extract references from text
100-
:param content: text
101-
:return: dictionary of references with reference id ([N]) as key
102-
"""
103-
references = {}
104-
content = content.replace("\n", "\\n")
105-
matches = re.findall('(\[[0-9]+\][^\[]*?(?=\[|Acknowledge|Fig|Table|Conclusion|pdf))', content)
106-
if matches:
107-
for match in matches:
108-
ref_id = self.get_reference_id(match)
109-
# No reference id exist -- skip it
110-
if ref_id != -1:
111-
value = match.replace('\\n', '\n')
112-
references[ref_id] = value
113-
return references
114-
115-
def get_reference_id(self, reference):
116-
"""
117-
Extract reference id ([N])
118-
:param reference: Any possible reference
119-
:return: reference id
120-
"""
121-
ref_id = -1
122-
match = re.search('\[[0-9]+\]', reference)
123-
if match:
124-
ref_id = int(match.group(0).strip('[]'))
125-
return ref_id
126-
127132
def read_records(self, in_file):
128133
'''
129134
Reads brat annotations as solr input documents
@@ -165,7 +170,7 @@ def read_records(self, in_file):
165170
ch['excerpt_t'] = self.extract_excerpt(txt, anc_doc)
166171

167172
# Extract references
168-
references = self.extract_references(txt)
173+
references = extract_references(txt)
169174

170175
# Remove references from the content
171176
for ref_id in references:
@@ -184,7 +189,7 @@ def read_records(self, in_file):
184189
if 'name' in child:
185190
child['can_name'] = canonical_name(child['name'])
186191
if 'target_names_ss' in child:
187-
child['target_names_ss'] = map(canonical_target_name, child['target_names_ss'])
192+
child['target_names_ss'] = map(canonical_name, child['target_names_ss'])
188193
if 'cont_names_ss' in child:
189194
child['cont_names_ss'] = map(canonical_name, child['cont_names_ss'])
190195
yield child

0 commit comments

Comments
 (0)