Skip to content
This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 1fde6fe

Browse files
committed
further improvements in phrase processing
1 parent 93cfb36 commit 1fde6fe

File tree

4 files changed

+77
-15
lines changed

4 files changed

+77
-15
lines changed

src/main/java/com/graphaware/nlp/domain/Phrase.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
public class Phrase {
1919
private final String content;
20+
private Phrase reference;
2021

2122
public Phrase(String content) {
2223
this.content = content.trim();
@@ -32,5 +33,12 @@ public boolean equals(Object o) {
3233
return false;
3334
return this.content.equalsIgnoreCase(((Phrase)o).content);
3435
}
35-
36+
37+
public Phrase getReference() {
38+
return reference;
39+
}
40+
41+
public void setReference(Phrase reference) {
42+
this.reference = reference;
43+
}
3644
}

src/main/java/com/graphaware/nlp/domain/Sentence.java

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@
3535
public class Sentence implements Persistable {
3636

3737
public static final int NO_SENTIMENT = -1;
38-
38+
3939
private final Map<String, Tag> tags;
4040
private Map<Integer, PartOfTextOccurrence<Tag>> tagOccurrences;
4141
private Map<Integer, Map<Integer, PartOfTextOccurrence<Phrase>>> phraseOccurrences;
4242

4343
private final String sentence;
4444
private int sentiment = NO_SENTIMENT;
45-
45+
4646
private boolean store = false;
4747
private String id;
4848

@@ -83,7 +83,7 @@ public String getId() {
8383

8484
public void addTagOccurrence(int begin, int end, Tag tag) {
8585
if (begin < 0) {
86-
throw new RuntimeException("Begin cannot be negative (for tag: " + tag.getLemma() + ")" );
86+
throw new RuntimeException("Begin cannot be negative (for tag: " + tag.getLemma() + ")");
8787
}
8888
if (tagOccurrences == null) {
8989
tagOccurrences = new HashMap<>();
@@ -101,13 +101,13 @@ public Tag getTagOccurrence(int begin) {
101101
if (occurrence != null) {
102102
return occurrence.getElement();
103103
} else {
104-
return null;
104+
return null;
105105
}
106106
}
107-
107+
108108
public void addPhraseOccurrence(int begin, int end, Phrase phrase) {
109109
if (begin < 0) {
110-
throw new RuntimeException("Begin cannot be negative (for phrase: " + phrase.getContent()+ ")" );
110+
throw new RuntimeException("Begin cannot be negative (for phrase: " + phrase.getContent() + ")");
111111
}
112112
if (phraseOccurrences == null) {
113113
phraseOccurrences = new HashMap<>();
@@ -125,16 +125,28 @@ public List<Phrase> getPhraseOccurrence(int begin) {
125125
throw new RuntimeException("Begin cannot be negative");
126126
}
127127
Map<Integer, PartOfTextOccurrence<Phrase>> occurrence = phraseOccurrences.get(begin);
128-
128+
129129
if (occurrence != null) {
130130
List<Phrase> result = new ArrayList<>();
131131
occurrence.values().stream().forEach((item) -> {
132132
result.add(item.getElement());
133133
});
134134
return result;
135135
} else {
136-
return new ArrayList<>();
136+
return new ArrayList<>();
137+
}
138+
}
139+
140+
public Phrase getPhraseOccurrence(int begin, int end) {
141+
if (begin < 0) {
142+
throw new RuntimeException("Begin cannot be negative");
137143
}
144+
Map<Integer, PartOfTextOccurrence<Phrase>> occurrences = phraseOccurrences.get(begin);
145+
146+
if (occurrences != null && occurrences.containsKey(end)) {
147+
return occurrences.get(end).getElement();
148+
}
149+
return null;
138150
}
139151

140152
@Override

src/main/java/com/graphaware/nlp/processor/TextProcessor.java

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
import com.graphaware.nlp.domain.Phrase;
2020
import com.graphaware.nlp.domain.Sentence;
2121
import com.graphaware.nlp.domain.Tag;
22+
import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
23+
import edu.stanford.nlp.hcoref.data.CorefChain;
2224
import edu.stanford.nlp.ling.CoreAnnotations;
2325
import edu.stanford.nlp.ling.CoreLabel;
2426
import edu.stanford.nlp.ling.Word;
@@ -98,16 +100,16 @@ private void createCompletePipeline() {
98100
public AnnotatedText annotateText(String text, Object id, boolean sentiment, boolean store) {
99101
StanfordCoreNLP pipeline;
100102
if (sentiment) {
101-
pipeline = pipelines.get(PIPELINE.COMPLETE);
103+
pipeline = pipelines.get(PIPELINE.COMPLETE);
102104
} else {
103105
pipeline = pipelines.get(PIPELINE.BASIC);
104106
}
105107
return annotateText(text, id, pipeline, store);
106108
}
107-
108-
public AnnotatedText annotateText(String text, Object id, StanfordCoreNLP pipeline, boolean store) {
109+
110+
public AnnotatedText annotateText(String text, Object id, StanfordCoreNLP pipeline, boolean store) {
109111
AnnotatedText result = new AnnotatedText(id);
110-
Annotation document = new Annotation(text);
112+
Annotation document = new Annotation(text);
111113
pipeline.annotate(document);
112114
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
113115
final AtomicInteger sentenceSequence = new AtomicInteger(0);
@@ -121,13 +123,15 @@ public AnnotatedText annotateText(String text, Object id, StanfordCoreNLP pipeli
121123
extractPhrases(sentence, newSentence);
122124
result.addSentence(newSentence);
123125
});
126+
extractRelationship(result, sentences, document);
124127
return result;
125128
}
126129

127130
protected void extractPhrases(CoreMap sentence, Sentence newSentence) {
128131
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
129-
if (tree == null)
132+
if (tree == null) {
130133
return;
134+
}
131135
Set<PhraseHolder> extractedPhrases = inspectSubTree(tree);
132136
extractedPhrases.stream().forEach((holder) -> {
133137
newSentence.addPhraseOccurrence(holder.getBeginPosition(), holder.getEndPosition(), new Phrase(holder.getPhrase()));
@@ -199,6 +203,41 @@ protected void extractTokens(CoreMap sentence, final Sentence newSentence) {
199203
}
200204
}
201205

206+
private void extractRelationship(AnnotatedText annotatedText, List<CoreMap> sentences, Annotation document) {
207+
Map<Integer, CorefChain> corefChains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
208+
if (corefChains != null) {
209+
for (CorefChain chain : corefChains.values()) {
210+
CorefChain.CorefMention representative = chain.getRepresentativeMention();
211+
int representativeSenteceNumber = representative.sentNum - 1;
212+
List<CoreLabel> representativeTokens = sentences.get(representativeSenteceNumber).get(CoreAnnotations.TokensAnnotation.class);
213+
int beginPosition = representativeTokens.get(representative.startIndex - 1).beginPosition();
214+
int endPosition = representativeTokens.get(representative.endIndex - 1).endPosition();
215+
Phrase representativePhraseOccurrence = annotatedText.getSentences().get(representativeSenteceNumber).getPhraseOccurrence(beginPosition, endPosition);
216+
if (representativePhraseOccurrence == null) {
217+
LOG.warn("Representative Phrase not found: " + representative.mentionSpan);
218+
}
219+
for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
220+
if (mention == representative) {
221+
continue;
222+
}
223+
int mentionSentenceNumber = mention.sentNum - 1;
224+
225+
List<CoreLabel> mentionTokens = sentences.get(mentionSentenceNumber).get(CoreAnnotations.TokensAnnotation.class);
226+
int beginPositionMention = mentionTokens.get(mention.startIndex - 1).beginPosition();
227+
int endPositionMention = mentionTokens.get(mention.endIndex - 1).endPosition();
228+
Phrase mentionPhraseOccurrence = annotatedText.getSentences().get(representativeSenteceNumber).getPhraseOccurrence(beginPositionMention, endPositionMention);
229+
if (mentionPhraseOccurrence == null) {
230+
LOG.warn("Mention Phrase not found: " + mention.mentionSpan);
231+
}
232+
if (representativePhraseOccurrence != null
233+
&& mentionPhraseOccurrence != null) {
234+
mentionPhraseOccurrence.setReference(representativePhraseOccurrence);
235+
}
236+
}
237+
}
238+
}
239+
}
240+
202241
public AnnotatedText sentiment(AnnotatedText annotated) {
203242
StanfordCoreNLP pipeline = pipelines.get(PIPELINE.SENTIMENT);
204243
annotated.getSentences().parallelStream().forEach((item) -> {
@@ -216,8 +255,9 @@ public AnnotatedText sentiment(AnnotatedText annotated) {
216255
private int extractSentiment(CoreMap sentence) {
217256
Tree tree = sentence
218257
.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
219-
if (tree == null)
258+
if (tree == null) {
220259
return Sentence.NO_SENTIMENT;
260+
}
221261
int score = RNNCoreAnnotations.getPredictedClass(tree);
222262
return score;
223263
}

src/test/java/com/graphaware/nlp/processor/TextProcessorTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ public void testAnnotatedTextWithPosition() {
194194
assertTrue(sentence1.getPhraseOccurrence(103).contains(new Phrase("Sentiment")));
195195
assertTrue(sentence1.getPhraseOccurrence(113).contains(new Phrase("Analysis")));
196196

197+
//his(76)-> the third author(54)
198+
//assertTrue(sentence1.getPhraseOccurrence(76).get(1).getReference().getContent().equalsIgnoreCase("the third author"));
197199
Sentence sentence2 = annotateText.getSentences().get(1);
198200
assertEquals("chart", sentence2.getTagOccurrence(184).getLemma());
199201
assertEquals("Figure", sentence2.getTagOccurrence(193).getLemma());

0 commit comments

Comments
 (0)