further improvements in phrase processing

alenegro81 · alenegro81 · commit 1fde6fe2a26d · 2016-08-02T16:28:34.000+02:00
diff --git a/src/main/java/com/graphaware/nlp/domain/Phrase.java b/src/main/java/com/graphaware/nlp/domain/Phrase.java
@@ -17,6 +17,7 @@
 
 public class Phrase {
     private final String content;
+    private Phrase reference;
 
     public Phrase(String content) {
         this.content = content.trim();
@@ -32,5 +33,12 @@ public boolean equals(Object o) {
             return false;
         return this.content.equalsIgnoreCase(((Phrase)o).content);
     }
-    
+
+    public Phrase getReference() { // returns the phrase this one points to, i.e. the value last given to setReference
+        return reference;
+    }
+
+    public void setReference(Phrase reference) { // stores the phrase this one points to (TextProcessor passes a coreference chain's representative phrase)
+        this.reference = reference;
+    }
 }
diff --git a/src/main/java/com/graphaware/nlp/domain/Sentence.java b/src/main/java/com/graphaware/nlp/domain/Sentence.java
@@ -35,14 +35,14 @@
 public class Sentence implements Persistable {
 
     public static final int NO_SENTIMENT = -1;
-    
+
     private final Map<String, Tag> tags;
     private Map<Integer, PartOfTextOccurrence<Tag>> tagOccurrences;
     private Map<Integer, Map<Integer, PartOfTextOccurrence<Phrase>>> phraseOccurrences;
 
     private final String sentence;
     private int sentiment = NO_SENTIMENT;
-    
+
     private boolean store = false;
     private String id;
 
@@ -83,7 +83,7 @@ public String getId() {
 
     public void addTagOccurrence(int begin, int end, Tag tag) {
         if (begin < 0) {
-            throw new RuntimeException("Begin cannot be negative (for tag: " + tag.getLemma() + ")" );
+            throw new RuntimeException("Begin cannot be negative (for tag: " + tag.getLemma() + ")");
         }
         if (tagOccurrences == null) {
             tagOccurrences = new HashMap<>();
@@ -101,13 +101,13 @@ public Tag getTagOccurrence(int begin) {
         if (occurrence != null) {
             return occurrence.getElement();
         } else {
-          return null;  
+            return null;
         }
     }
-    
+
     public void addPhraseOccurrence(int begin, int end, Phrase phrase) {
         if (begin < 0) {
-            throw new RuntimeException("Begin cannot be negative (for phrase: " + phrase.getContent()+ ")" );
+            throw new RuntimeException("Begin cannot be negative (for phrase: " + phrase.getContent() + ")");
         }
         if (phraseOccurrences == null) {
             phraseOccurrences = new HashMap<>();
@@ -125,16 +125,28 @@ public List<Phrase> getPhraseOccurrence(int begin) {
             throw new RuntimeException("Begin cannot be negative");
         }
         Map<Integer, PartOfTextOccurrence<Phrase>> occurrence = phraseOccurrences.get(begin);
-        
+
         if (occurrence != null) {
             List<Phrase> result = new ArrayList<>();
             occurrence.values().stream().forEach((item) -> {
                 result.add(item.getElement());
             });
             return result;
         } else {
-          return new ArrayList<>();  
+            return new ArrayList<>();
+        }
+    }
+
+    public Phrase getPhraseOccurrence(int begin, int end) { // exact [begin, end] lookup; returns null when no phrase is stored for that span
+        if (begin < 0) {
+            throw new RuntimeException("Begin cannot be negative");
         }
+        // phraseOccurrences is only created inside addPhraseOccurrence, so guard against it still being null here
+        Map<Integer, PartOfTextOccurrence<Phrase>> occurrences = phraseOccurrences == null ? null : phraseOccurrences.get(begin);
+        if (occurrences == null || !occurrences.containsKey(end)) {
+            return null;
+        }
+        return occurrences.get(end).getElement();
     }
 
     @Override
diff --git a/src/main/java/com/graphaware/nlp/processor/TextProcessor.java b/src/main/java/com/graphaware/nlp/processor/TextProcessor.java
@@ -19,6 +19,8 @@
 import com.graphaware.nlp.domain.Phrase;
 import com.graphaware.nlp.domain.Sentence;
 import com.graphaware.nlp.domain.Tag;
+import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.hcoref.data.CorefChain;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.Word;
@@ -98,16 +100,16 @@ private void createCompletePipeline() {
     public AnnotatedText annotateText(String text, Object id, boolean sentiment, boolean store) {
         StanfordCoreNLP pipeline;
         if (sentiment) {
-             pipeline = pipelines.get(PIPELINE.COMPLETE);
+            pipeline = pipelines.get(PIPELINE.COMPLETE); // sentiment requested: annotate with the COMPLETE pipeline instead of BASIC
         } else {
             pipeline = pipelines.get(PIPELINE.BASIC);
         }
         return annotateText(text, id, pipeline, store);
     }
-    
-    public AnnotatedText annotateText(String text, Object id, StanfordCoreNLP pipeline, boolean store) {        
+
+    public AnnotatedText annotateText(String text, Object id, StanfordCoreNLP pipeline, boolean store) {
         AnnotatedText result = new AnnotatedText(id);
-        Annotation document = new Annotation(text);        
+        Annotation document = new Annotation(text);
         pipeline.annotate(document);
         List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
         final AtomicInteger sentenceSequence = new AtomicInteger(0);
@@ -121,13 +123,15 @@ public AnnotatedText annotateText(String text, Object id, StanfordCoreNLP pipeli
             extractPhrases(sentence, newSentence);
             result.addSentence(newSentence);
         });
+        extractRelationship(result, sentences, document);
         return result;
     }
 
     protected void extractPhrases(CoreMap sentence, Sentence newSentence) {
         Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
-        if (tree == null)
+        if (tree == null) {
             return;
+        }
         Set<PhraseHolder> extractedPhrases = inspectSubTree(tree);
         extractedPhrases.stream().forEach((holder) -> {
             newSentence.addPhraseOccurrence(holder.getBeginPosition(), holder.getEndPosition(), new Phrase(holder.getPhrase()));
@@ -199,6 +203,41 @@ protected void extractTokens(CoreMap sentence, final Sentence newSentence) {
         }
     }
 
+    /** Points every coreference mention's phrase at its chain's representative phrase; unmatched mentions are only logged. */
+    private void extractRelationship(AnnotatedText annotatedText, List<CoreMap> sentences, Annotation document) {
+        Map<Integer, CorefChain> corefChains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
+        if (corefChains == null) {
+            return; // the document carries no coreference annotation
+        }
+        for (CorefChain chain : corefChains.values()) {
+            CorefChain.CorefMention representative = chain.getRepresentativeMention();
+            Phrase representativePhrase = findMentionPhrase(annotatedText, sentences, representative);
+            if (representativePhrase == null) {
+                LOG.warn("Representative Phrase not found: " + representative.mentionSpan);
+            }
+            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
+                if (mention == representative) {
+                    continue;
+                }
+                Phrase mentionPhrase = findMentionPhrase(annotatedText, sentences, mention);
+                if (mentionPhrase == null) {
+                    LOG.warn("Mention Phrase not found: " + mention.mentionSpan);
+                } else if (representativePhrase != null) {
+                    mentionPhrase.setReference(representativePhrase);
+                }
+            }
+        }
+    }
+
+    /** Looks up the phrase stored for a mention's character span, in the mention's own sentence; null if none is stored. */
+    private Phrase findMentionPhrase(AnnotatedText annotatedText, List<CoreMap> sentences, CorefChain.CorefMention mention) {
+        int sentenceIndex = mention.sentNum - 1; // sentNum is 1-based
+        List<CoreLabel> tokens = sentences.get(sentenceIndex).get(CoreAnnotations.TokensAnnotation.class);
+        int begin = tokens.get(mention.startIndex - 1).beginPosition();
+        int end = tokens.get(mention.endIndex - 2).endPosition(); // endIndex is 1-based and one past the last token
+        return annotatedText.getSentences().get(sentenceIndex).getPhraseOccurrence(begin, end);
+    }
+
     public AnnotatedText sentiment(AnnotatedText annotated) {
         StanfordCoreNLP pipeline = pipelines.get(PIPELINE.SENTIMENT);
         annotated.getSentences().parallelStream().forEach((item) -> {
@@ -216,8 +255,9 @@ public AnnotatedText sentiment(AnnotatedText annotated) {
     private int extractSentiment(CoreMap sentence) {
         Tree tree = sentence
                 .get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
-        if (tree == null) 
+        if (tree == null) { // the sentence carries no sentiment-annotated tree: report NO_SENTIMENT
             return Sentence.NO_SENTIMENT;
+        }
         int score = RNNCoreAnnotations.getPredictedClass(tree);
         return score;
     }
diff --git a/src/test/java/com/graphaware/nlp/processor/TextProcessorTest.java b/src/test/java/com/graphaware/nlp/processor/TextProcessorTest.java
@@ -194,6 +194,8 @@ public void testAnnotatedTextWithPosition() {
         assertTrue(sentence1.getPhraseOccurrence(103).contains(new Phrase("Sentiment")));
         assertTrue(sentence1.getPhraseOccurrence(113).contains(new Phrase("Analysis")));
         
+        //his(76)-> the third author(54)
+        //assertTrue(sentence1.getPhraseOccurrence(76).get(1).getReference().getContent().equalsIgnoreCase("the third author"));
         Sentence sentence2 = annotateText.getSentences().get(1);
         assertEquals("chart", sentence2.getTagOccurrence(184).getLemma());
         assertEquals("Figure", sentence2.getTagOccurrence(193).getLemma());