11from solr import Solr
22import os , sys
3+ reload (sys )
4+ sys .setdefaultencoding ('UTF8' ) # making UTF8 as default encoding
35from argparse import ArgumentParser
46from indexer import parse_lpsc_from_path
57import re
68from utils import canonical_name
79
10+ # Functions to perform reference removal (assumes [n] reference style)
11+ # Written by Karanjeet Singh
def extract_references(content):
    """
    Extract references from text (assumes the ``[N]`` reference style)

    A reference starts at a ``[N]`` marker and runs up to, but not including,
    the next ``[`` or the next occurrence of one of the stop words
    ``Acknowledge``, ``Fig``, ``Table``, ``Conclusion`` or ``pdf``. A marker
    that is followed by none of these (for example the very last one in the
    text) is not reported.

    :param content: text
    :return: dictionary of references with reference id (N of ``[N]``, as an
             int) as key and the matched text, exactly as it appears in
             content, as value. If an id occurs more than once, the last
             occurrence wins. Empty dictionary for empty/None content.
    """
    references = {}
    if not content:
        return references

    # Raw string so that '\[' reaches the regex engine untouched instead of
    # relying on Python passing unknown string escapes through.
    # The negated class [^\[] already matches newlines, so the text is
    # searched as-is: escaping and un-escaping newlines around the search is
    # unnecessary and would alter text that already contains the literal
    # escape sequence.
    pattern = r'\[([0-9]+)\][^\[]*?(?=\[|Acknowledge|Fig|Table|Conclusion|pdf)'
    for found in re.finditer(pattern, content):
        # Every match starts with its own [N] marker, so group 1 is the id.
        references[int(found.group(1))] = found.group(0)
    return references
29+
def get_reference_id(reference):
    """
    Extract reference id ([N])

    Only the first ``[N]`` marker (N being one or more digits) found in the
    text is considered.

    :param reference: Any possible reference
    :return: reference id as an int, or -1 when the text is empty/None or
             contains no ``[N]`` marker
    """
    if not reference:
        return -1
    # Raw string keeps the regex escapes intact; the capture group yields the
    # digits directly, without stripping the brackets afterwards.
    marker = re.search(r'\[([0-9]+)\]', reference)
    if marker is None:
        return -1
    return int(marker.group(1))
41+
42+
843class BratAnnIndexer ():
944 '''
1045 This class reads/parses brat annotations from file system and indexes them
@@ -94,36 +129,6 @@ def extract_excerpt(self, content, ann):
94129 sent_end = len (content )
95130 return content [sent_start :sent_end ]
96131
97- def extract_references (self , content ):
98- """
99- Extract references from text
100- :param content: text
101- :return: dictionary of references with reference id ([N]) as key
102- """
103- references = {}
104- content = content .replace ("\n " , "\\ n" )
105- matches = re .findall ('(\[[0-9]+\][^\[]*?(?=\[|Acknowledge|Fig|Table|Conclusion|pdf))' , content )
106- if matches :
107- for match in matches :
108- ref_id = self .get_reference_id (match )
109- # No reference id exist -- skip it
110- if ref_id != - 1 :
111- value = match .replace ('\\ n' , '\n ' )
112- references [ref_id ] = value
113- return references
114-
115- def get_reference_id (self , reference ):
116- """
117- Extract reference id ([N])
118- :param reference: Any possible reference
119- :return: reference id
120- """
121- ref_id = - 1
122- match = re .search ('\[[0-9]+\]' , reference )
123- if match :
124- ref_id = int (match .group (0 ).strip ('[]' ))
125- return ref_id
126-
127132 def read_records (self , in_file ):
128133 '''
129134 Reads brat annotations as solr input documents
@@ -165,7 +170,7 @@ def read_records(self, in_file):
165170 ch ['excerpt_t' ] = self .extract_excerpt (txt , anc_doc )
166171
167172 # Extract references
168- references = self . extract_references (txt )
173+ references = extract_references (txt )
169174
170175 # Remove references from the content
171176 for ref_id in references :
@@ -184,7 +189,7 @@ def read_records(self, in_file):
184189 if 'name' in child :
185190 child ['can_name' ] = canonical_name (child ['name' ])
186191 if 'target_names_ss' in child :
187- child ['target_names_ss' ] = map (canonical_target_name , child ['target_names_ss' ])
192+ child ['target_names_ss' ] = map (canonical_name , child ['target_names_ss' ])
188193 if 'cont_names_ss' in child :
189194 child ['cont_names_ss' ] = map (canonical_name , child ['cont_names_ss' ])
190195 yield child
0 commit comments