Skip to content
This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 979f7f6

Browse files
committed
added support for tag occurrences in text processor
1 parent c8cd933 commit 979f7f6

File tree

3 files changed

+209
-54
lines changed

3 files changed

+209
-54
lines changed

src/main/java/com/graphaware/nlp/domain/Sentence.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package com.graphaware.nlp.domain;
1717

18+
import com.graphaware.common.util.Pair;
1819
import static com.graphaware.nlp.domain.SentimentLabels.*;
1920
import static com.graphaware.nlp.domain.Labels.Sentence;
2021
import static com.graphaware.nlp.domain.Properties.HASH;
@@ -33,6 +34,8 @@
3334
public class Sentence implements Persistable {
3435

3536
private final Map<String, Tag> tags;
37+
private Map<Integer, TagOccurrence> tagOccurrences;
38+
3639
private final String sentence;
3740
private int sentiment = -1;
3841
private boolean store = false;
@@ -73,6 +76,30 @@ public String getId() {
7376
return id;
7477
}
7578

79+
public void addOccurrence(int begin, int end, Tag tag) {
80+
if (begin < 0) {
81+
throw new RuntimeException("Begin cannot be negative (for tag: " + tag.getLemma() + ")" );
82+
}
83+
if (tagOccurrences == null) {
84+
tagOccurrences = new HashMap<>();
85+
}
86+
//Will update end if already exist
87+
tagOccurrences.put(begin, new TagOccurrence(tag, begin, end));
88+
}
89+
90+
//Currently used only for testing purpose
91+
public Tag getTagOccurrence(int begin) {
92+
if (begin < 0) {
93+
throw new RuntimeException("Begin cannot be negative");
94+
}
95+
TagOccurrence occurrence = tagOccurrences.get(begin);
96+
if (occurrence != null) {
97+
return occurrence.getTag();
98+
} else {
99+
return null;
100+
}
101+
}
102+
76103
@Override
77104
public Node storeOnGraph(GraphDatabaseService database) {
78105
Node sequenceNode = checkIfExist(database, id);
@@ -138,4 +165,23 @@ private Node checkIfExist(GraphDatabaseService database, Object id) {
138165
}
139166
return null;
140167
}
168+
169+
class TagOccurrence {
170+
171+
private final Tag tag;
172+
private final Pair<Integer, Integer> span;
173+
174+
public TagOccurrence(Tag tag, int begin, int end) {
175+
this.tag = tag;
176+
this.span = new Pair<>(begin, end);
177+
}
178+
179+
public Tag getTag() {
180+
return tag;
181+
}
182+
183+
public Pair<Integer, Integer> getSpan() {
184+
return span;
185+
}
186+
}
141187
}

src/main/java/com/graphaware/nlp/processor/TextProcessor.java

Lines changed: 128 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -101,72 +101,92 @@ public AnnotatedText annotateText(String text, Object id, boolean sentiment, boo
101101
pipelines.get(PIPELINE.BASIC).annotate(document);
102102
}
103103
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
104-
final String background = backgroundSymbol;
105104
final AtomicInteger sentenceSequence = new AtomicInteger(0);
106105
sentences.stream().map((sentence) -> {
107106
return sentence;
108107
}).forEach((sentence) -> {
109108
String sentenceId = id + "_" + sentenceSequence.getAndIncrement();
110109
final Sentence newSentence = new Sentence(sentence.toString(), store, sentenceId);
111-
final AtomicReference<String> prevNe = new AtomicReference<>();
112-
prevNe.set(background);
113-
final AtomicReference<StringBuilder> sb = new AtomicReference<>();
114-
sb.set(new StringBuilder());
115-
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
116-
tokens.stream()
117-
.filter((token) -> (token != null) && checkPuntuation(token.get(CoreAnnotations.LemmaAnnotation.class)))
118-
.map((token) -> {
119-
//
120-
String currentNe = StringUtils.getNotNullString(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
121-
if (currentNe.equals(background) && prevNe.get().equals(background)) {
122-
Tag tag = getTag(token);
123-
if (tag != null) {
124-
newSentence.addTag(tag);
125-
}
126-
} else if (currentNe.equals(background) && !prevNe.get().equals(background)) {
127-
Tag newTag = new Tag(sb.get().toString());
128-
newTag.setNe(prevNe.get());
129-
newSentence.addTag(newTag);
130-
sb.set(new StringBuilder());
131-
Tag tag = getTag(token);
132-
if (tag != null) {
133-
newSentence.addTag(tag);
134-
}
135-
} else if (!currentNe.equals(prevNe.get()) && !prevNe.get().equals(background)) {
136-
Tag newTag = new Tag(sb.get().toString());
137-
newTag.setNe(prevNe.get());
138-
newSentence.addTag(newTag);
139-
sb.set(new StringBuilder());
140-
sb.get().append(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
141-
} else if (!currentNe.equals(background) && prevNe.get().equals(background)) {
142-
sb.get().append(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
143-
} else {
144-
String before = StringUtils.getNotNullString(token.get(CoreAnnotations.BeforeAnnotation.class));
145-
String currentText = StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class));
146-
sb.get().append(before);
147-
sb.get().append(currentText);
148-
}
149-
return currentNe;
150-
}).forEach((currentNe) -> {
151-
prevNe.set(currentNe);
152-
});
153-
154-
if (sb.get().length() > 0) {
155-
Tag tag = new Tag(sb.get().toString());
156-
tag.setNe(prevNe.get());
157-
newSentence.addTag(tag);
158-
}
159-
110+
extractTokens(sentence, newSentence);
160111
if (sentiment) {
161-
int score = extractSentiment(sentence);
162-
newSentence.setSentiment(score);
112+
extractSentiment(sentence, newSentence);
163113
}
164114
result.addSentence(newSentence);
165115

166116
});
167117
return result;
168118
}
169119

120+
protected void extractSentiment(CoreMap sentence, final Sentence newSentence) {
121+
int score = extractSentiment(sentence);
122+
newSentence.setSentiment(score);
123+
}
124+
125+
protected void extractTokens(CoreMap sentence, final Sentence newSentence) {
126+
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
127+
final String background = backgroundSymbol;
128+
// final AtomicReference<String> prevNe = new AtomicReference<>();
129+
// prevNe.set(background);
130+
// final AtomicReference<StringBuilder> sb = new AtomicReference<>();
131+
// sb.set(new StringBuilder());
132+
TokenHolder currToken = new TokenHolder();
133+
currToken.setNe(background);
134+
tokens.stream()
135+
.filter((token) -> (token != null) && checkPuntuation(token.get(CoreAnnotations.LemmaAnnotation.class)))
136+
.map((token) -> {
137+
//
138+
String currentNe = StringUtils.getNotNullString(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
139+
if (currentNe.equals(background) && currToken.getNe().equals(background)) {
140+
Tag tag = getTag(token);
141+
if (tag != null) {
142+
newSentence.addTag(tag);
143+
newSentence.addOccurrence(token.beginPosition(), token.endPosition(), tag);
144+
}
145+
} else if (currentNe.equals(background) && !currToken.getNe().equals(background)) {
146+
Tag newTag = new Tag(currToken.getToken());
147+
newTag.setNe(currToken.getNe());
148+
newSentence.addTag(newTag);
149+
newSentence.addOccurrence(currToken.getBeginPosition(), currToken.getEndPosition(), newTag);
150+
currToken.reset();
151+
Tag tag = getTag(token);
152+
if (tag != null) {
153+
newSentence.addTag(tag);
154+
newSentence.addOccurrence(token.beginPosition(), token.endPosition(), tag);
155+
}
156+
} else if (!currentNe.equals(currToken.getNe()) && !currToken.getNe().equals(background)) {
157+
Tag tag = new Tag(currToken.getToken());
158+
tag.setNe(currToken.getNe());
159+
newSentence.addTag(tag);
160+
newSentence.addOccurrence(currToken.getBeginPosition(), currToken.getEndPosition(), tag);
161+
currToken.reset();
162+
currToken.updateToken(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
163+
currToken.setBeginPosition(token.beginPosition());
164+
currToken.setEndPosition(token.endPosition());
165+
} else if (!currentNe.equals(background) && currToken.getNe().equals(background)) {
166+
currToken.updateToken(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
167+
currToken.setBeginPosition(token.beginPosition());
168+
currToken.setEndPosition(token.endPosition());
169+
} else {
170+
String before = StringUtils.getNotNullString(token.get(CoreAnnotations.BeforeAnnotation.class));
171+
String currentText = StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class));
172+
currToken.updateToken(before);
173+
currToken.updateToken(currentText);
174+
currToken.setBeginPosition(token.beginPosition());
175+
currToken.setEndPosition(token.endPosition());
176+
}
177+
return currentNe;
178+
}).forEach((currentNe) -> {
179+
currToken.setNe(currentNe);
180+
});
181+
182+
if (currToken.getToken().length() > 0) {
183+
Tag tag = new Tag(currToken.getToken());
184+
tag.setNe(currToken.getNe());
185+
newSentence.addTag(tag);
186+
newSentence.addOccurrence(currToken.getBeginPosition(), currToken.getEndPosition(), tag);
187+
}
188+
}
189+
170190
public AnnotatedText sentiment(AnnotatedText annotated) {
171191
StanfordCoreNLP pipeline = pipelines.get(PIPELINE.SENTIMENT);
172192
annotated.getSentences().parallelStream().forEach((item) -> {
@@ -175,8 +195,7 @@ public AnnotatedText sentiment(AnnotatedText annotated) {
175195
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
176196
Optional<CoreMap> sentence = sentences.stream().findFirst();
177197
if (sentence != null && sentence.isPresent()) {
178-
int score = extractSentiment(sentence.get());
179-
item.setSentiment(score);
198+
extractSentiment(sentence.get(), item);
180199
}
181200
});
182201
return annotated;
@@ -250,6 +269,61 @@ public boolean checkPuntuation(String value) {
250269
return !match.find();
251270
}
252271

272+
/**
 * Mutable accumulator for a token that may span several
 * underlying tokens: it collects the text, the named-entity label and the
 * begin/end positions.
 */
class TokenHolder {

    private String ne;
    private StringBuilder sb;
    private int beginPosition;
    private int endPosition;

    /** Creates an empty holder: no text, positions at -1, label unset. */
    public TokenHolder() {
        reset();
    }

    /**
     * Clears the accumulated text and sets both positions back to -1.
     * The named-entity label is left untouched.
     */
    public final void reset() {
        this.endPosition = -1;
        this.beginPosition = -1;
        this.sb = new StringBuilder();
    }

    /** Appends {@code tknStr} to the accumulated text. */
    public void updateToken(String tknStr) {
        sb.append(tknStr);
    }

    /** @return the accumulated text, or {@code " - "} if there is no buffer */
    public String getToken() {
        return (sb != null) ? sb.toString() : " - ";
    }

    /** @return the current named-entity label */
    public String getNe() {
        return this.ne;
    }

    /** Sets the named-entity label. */
    public void setNe(String ne) {
        this.ne = ne;
    }

    /** @return the begin position, or -1 if none has been set since the last reset */
    public int getBeginPosition() {
        return this.beginPosition;
    }

    /**
     * Sets the begin position only while it is still negative, so the first
     * value given after a reset is the one that is kept.
     */
    public void setBeginPosition(int beginPosition) {
        if (this.beginPosition >= 0) {
            return;
        }
        this.beginPosition = beginPosition;
    }

    /** @return the end position, or -1 if none has been set since the last reset */
    public int getEndPosition() {
        return this.endPosition;
    }

    /** Overwrites the end position with the given value. */
    public void setEndPosition(int endPosition) {
        this.endPosition = endPosition;
    }
}
326+
253327
static class PipelineBuilder {
254328

255329
private final Properties properties = new Properties();

src/test/java/com/graphaware/nlp/processor/TextProcessorTest.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import com.graphaware.nlp.conceptnet5.ConceptNet5Importer;
1919
import com.graphaware.nlp.domain.AnnotatedText;
20+
import com.graphaware.nlp.domain.Sentence;
2021
import com.graphaware.nlp.domain.Tag;
2122
import com.graphaware.nlp.persistence.GraphPersistence;
2223
import com.graphaware.nlp.persistence.LocalGraphDatabase;
@@ -27,6 +28,7 @@
2728
import java.util.Map;
2829
import static org.junit.Assert.assertEquals;
2930
import static org.junit.Assert.assertFalse;
31+
import static org.junit.Assert.assertNull;
3032
import org.junit.Test;
3133
import org.neo4j.graphdb.Node;
3234
import org.neo4j.graphdb.QueryExecutionException;
@@ -149,4 +151,37 @@ public void testSentiment() {
149151
assertEquals(1, annotateText.getSentences().size());
150152
assertEquals(4, annotateText.getSentences().get(0).getSentiment());
151153
}
154+
155+
@Test
156+
public void testAnnotatedTextWithPosition() {
157+
TextProcessor textProcessor = new TextProcessor();
158+
AnnotatedText annotateText = textProcessor.annotateText("On 8 May 2013, "
159+
+ "one week before the Pakistani election, the third author, "
160+
+ "in his keynote address at the Sentiment Analysis Symposium, "
161+
+ "forecast the winner of the Pakistani election. The chart "
162+
+ "in Figure 1 shows varying sentiment on the candidates for "
163+
+ "prime minister of Pakistan in that election. The next day, "
164+
+ "the BBC’s Owen Bennett Jones, reporting from Islamabad, wrote "
165+
+ "an article titled “Pakistan Elections: Five Reasons Why the "
166+
+ "Vote is Unpredictable,”1 in which he claimed that the election "
167+
+ "was too close to call. It was not, and despite his being in Pakistan, "
168+
+ "the outcome of the election was exactly as we predicted.", 1, false, false);
169+
170+
assertEquals(4, annotateText.getSentences().size());
171+
Sentence sentence1 = annotateText.getSentences().get(0);
172+
assertEquals(15, sentence1.getTags().size());
173+
174+
assertNull(sentence1.getTagOccurrence(0));
175+
assertEquals("8 May 2013", sentence1.getTagOccurrence(3).getLemma());
176+
assertEquals("one week", sentence1.getTagOccurrence(15).getLemma());
177+
assertEquals("before", sentence1.getTagOccurrence(24).getLemma());
178+
assertEquals("third", sentence1.getTagOccurrence(59).getLemma());
179+
assertEquals("sentiment", sentence1.getTagOccurrence(103).getLemma());
180+
assertEquals("forecast", sentence1.getTagOccurrence(133).getLemma());
181+
assertNull(sentence1.getTagOccurrence(184));
182+
183+
Sentence sentence2 = annotateText.getSentences().get(1);
184+
assertEquals("chart", sentence2.getTagOccurrence(184).getLemma());
185+
assertEquals("Figure", sentence2.getTagOccurrence(193).getLemma());
186+
}
152187
}

0 commit comments

Comments
 (0)