Skip to content

Commit 00899f7

Browse files
committed
Better end-of-sentence checks when generating excerpts.
1 parent f36c491 commit 00899f7

File tree

1 file changed

+5
-4
lines changed

1 file changed

+5
-4
lines changed

src/parserindexer/brat_ann_indexer.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,11 @@ def extract_excerpt(self, content, ann):
123123
if m:
124124
sent_start = sent_start + m.start()
125125
# End: next period followed by {space,newline}, or end of document.
126-
sent_end = anchor_end + content[anchor_end:].find('. ')+1
127-
if sent_end <= anchor_end:
128-
sent_end = anchor_end + content[anchor_end:].find('.\n')+1
129-
if sent_end <= anchor_end:
126+
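# Better: skip periods that end an abbreviation rather than a sentence, i.e. those preceded by "wt" ("wt."), "ig" ("Fig." for Figure), or "(e" / ".g" (as in "(e.g.").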
127+
m = re.search('(?<!(wt|ig|\(e|\.g))\.[ \n]', content[anchor_end:])
128+
if m != None:
129+
sent_end = anchor_end + m.start() + 1
130+
else:
130131
sent_end = len(content)
131132
return content[sent_start:sent_end]
132133

0 commit comments

Comments
 (0)