added support for tag occurrences in text processor

alenegro81 · alenegro81 · commit 979f7f62199c · 2016-07-30T18:22:09.000+02:00
diff --git a/src/main/java/com/graphaware/nlp/domain/Sentence.java b/src/main/java/com/graphaware/nlp/domain/Sentence.java
@@ -15,6 +15,7 @@
  */
 package com.graphaware.nlp.domain;
 
+import com.graphaware.common.util.Pair;
 import static com.graphaware.nlp.domain.SentimentLabels.*;
 import static com.graphaware.nlp.domain.Labels.Sentence;
 import static com.graphaware.nlp.domain.Properties.HASH;
@@ -33,6 +34,8 @@
 public class Sentence implements Persistable {
 
     private final Map<String, Tag> tags;
+    private Map<Integer, TagOccurrence> tagOccurrences;
+
     private final String sentence;
     private int sentiment = -1;
     private boolean store = false;
@@ -73,6 +76,30 @@ public String getId() {
         return id;
     }
 
+    /**
+     * Records that {@code tag} occurs in this sentence between {@code begin} and {@code end}.
+     * Occurrences are indexed by {@code begin}, so adding a second occurrence with the same
+     * {@code begin} replaces the one stored before.
+     *
+     * @param begin position at which the occurrence starts; must not be negative
+     * @param end   position at which the occurrence ends
+     * @param tag   tag found at that span
+     * @throws IllegalArgumentException if {@code begin} is negative
+     */
+    public void addOccurrence(int begin, int end, Tag tag) {
+        if (begin < 0) {
+            throw new IllegalArgumentException("Begin cannot be negative (for tag: " + tag.getLemma() + ")");
+        }
+        if (tagOccurrences == null) {
+            // Created on first use; a sentence without occurrences never allocates the map.
+            tagOccurrences = new HashMap<>();
+        }
+        // Keyed by begin only: an existing entry for the same begin is overwritten (tag and end).
+        tagOccurrences.put(begin, new TagOccurrence(tag, begin, end));
+    }
+
+    /**
+     * Returns the tag recorded as starting at {@code begin}.
+     * Currently used only for testing purposes.
+     *
+     * @param begin position at which the occurrence starts; must not be negative
+     * @return the tag stored for {@code begin}, or {@code null} when no occurrence starts there
+     *         (including when no occurrence has been added to this sentence at all)
+     * @throws IllegalArgumentException if {@code begin} is negative
+     */
+    public Tag getTagOccurrence(int begin) {
+        if (begin < 0) {
+            throw new IllegalArgumentException("Begin cannot be negative");
+        }
+        // The map is created lazily by addOccurrence, so it is still null for a
+        // sentence that never received an occurrence; treat that as "not found".
+        if (tagOccurrences == null) {
+            return null;
+        }
+        TagOccurrence occurrence = tagOccurrences.get(begin);
+        return occurrence != null ? occurrence.getTag() : null;
+    }
+
     @Override
     public Node storeOnGraph(GraphDatabaseService database) {
         Node sequenceNode = checkIfExist(database, id);
@@ -138,4 +165,23 @@ private Node checkIfExist(GraphDatabaseService database, Object id) {
         }
         return null;
     }
+
+    /**
+     * Holds a tag together with the begin/end span at which it occurs.
+     * Both references are fixed at construction time. The class is static because it
+     * never reads state of the enclosing sentence, so instances need no hidden
+     * reference to it.
+     */
+    static class TagOccurrence {
+
+        private final Tag tag;
+        // First element is the begin position, second element the end position.
+        private final Pair<Integer, Integer> span;
+
+        /**
+         * @param tag   tag that occurs at the span
+         * @param begin position at which the occurrence starts
+         * @param end   position at which the occurrence ends
+         */
+        public TagOccurrence(Tag tag, int begin, int end) {
+            this.tag = tag;
+            this.span = new Pair<>(begin, end);
+        }
+
+        /** @return the tag of this occurrence */
+        public Tag getTag() {
+            return tag;
+        }
+
+        /** @return the (begin, end) pair given at construction */
+        public Pair<Integer, Integer> getSpan() {
+            return span;
+        }
+    }
 }
diff --git a/src/main/java/com/graphaware/nlp/processor/TextProcessor.java b/src/main/java/com/graphaware/nlp/processor/TextProcessor.java
@@ -101,72 +101,92 @@ public AnnotatedText annotateText(String text, Object id, boolean sentiment, boo
             pipelines.get(PIPELINE.BASIC).annotate(document);
         }
         List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
-        final String background = backgroundSymbol;
         final AtomicInteger sentenceSequence = new AtomicInteger(0);
         sentences.stream().map((sentence) -> {
             return sentence;
         }).forEach((sentence) -> {
             String sentenceId = id + "_" + sentenceSequence.getAndIncrement();
             final Sentence newSentence = new Sentence(sentence.toString(), store, sentenceId);
-            final AtomicReference<String> prevNe = new AtomicReference<>();
-            prevNe.set(background);
-            final AtomicReference<StringBuilder> sb = new AtomicReference<>();
-            sb.set(new StringBuilder());
-            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
-            tokens.stream()
-                    .filter((token) -> (token != null) && checkPuntuation(token.get(CoreAnnotations.LemmaAnnotation.class)))
-                    .map((token) -> {
-                        //
-                        String currentNe = StringUtils.getNotNullString(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
-                        if (currentNe.equals(background) && prevNe.get().equals(background)) {
-                            Tag tag = getTag(token);
-                            if (tag != null) {
-                                newSentence.addTag(tag);
-                            }
-                        } else if (currentNe.equals(background) && !prevNe.get().equals(background)) {
-                            Tag newTag = new Tag(sb.get().toString());
-                            newTag.setNe(prevNe.get());
-                            newSentence.addTag(newTag);
-                            sb.set(new StringBuilder());
-                            Tag tag = getTag(token);
-                            if (tag != null) {
-                                newSentence.addTag(tag);
-                            }
-                        } else if (!currentNe.equals(prevNe.get()) && !prevNe.get().equals(background)) {
-                            Tag newTag = new Tag(sb.get().toString());
-                            newTag.setNe(prevNe.get());
-                            newSentence.addTag(newTag);
-                            sb.set(new StringBuilder());
-                            sb.get().append(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
-                        } else if (!currentNe.equals(background) && prevNe.get().equals(background)) {
-                            sb.get().append(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
-                        } else {
-                            String before = StringUtils.getNotNullString(token.get(CoreAnnotations.BeforeAnnotation.class));
-                            String currentText = StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class));
-                            sb.get().append(before);
-                            sb.get().append(currentText);
-                        }
-                        return currentNe;
-                    }).forEach((currentNe) -> {
-                prevNe.set(currentNe);
-            });
-
-            if (sb.get().length() > 0) {
-                Tag tag = new Tag(sb.get().toString());
-                tag.setNe(prevNe.get());
-                newSentence.addTag(tag);
-            }
-
+            extractTokens(sentence, newSentence);
             if (sentiment) {
-                int score = extractSentiment(sentence);
-                newSentence.setSentiment(score);
+                extractSentiment(sentence, newSentence);
             }
             result.addSentence(newSentence);
 
         });
         return result;
     }
 
+    /**
+     * Computes the sentiment score of {@code sentence} and stores it on {@code newSentence}.
+     *
+     * @param sentence    annotated sentence to score
+     * @param newSentence domain sentence that receives the score
+     */
+    protected void extractSentiment(CoreMap sentence, final Sentence newSentence) {
+        newSentence.setSentiment(extractSentiment(sentence));
+    }
+
+    /**
+     * Converts the tokens of {@code sentence} into tags on {@code newSentence} and records, for
+     * every tag, the begin/end positions at which it occurs.
+     * <p>
+     * Tokens that are null or rejected by {@link #checkPuntuation(String)} are skipped. A token
+     * whose named-entity label equals the background symbol produces its own tag through
+     * {@code getTag}. Consecutive tokens sharing the same non-background label are merged into a
+     * single tag carrying that label, spanning from the first token's begin to the last token's end.
+     *
+     * @param sentence    source of the tokens
+     * @param newSentence sentence that receives the tags and their occurrences
+     */
+    protected void extractTokens(CoreMap sentence, final Sentence newSentence) {
+        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
+        final String background = backgroundSymbol;
+        // Accumulates the text, label and span of the named entity currently being read.
+        TokenHolder currToken = new TokenHolder();
+        currToken.setNe(background);
+        // A plain loop is used on purpose: each step depends on the label of the previous
+        // token, which is state a stream lambda should not carry.
+        for (CoreLabel token : tokens) {
+            if (token == null || !checkPuntuation(token.get(CoreAnnotations.LemmaAnnotation.class))) {
+                continue;
+            }
+            String currentNe = StringUtils.getNotNullString(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
+            boolean currentIsBackground = currentNe.equals(background);
+            boolean previousIsBackground = currToken.getNe().equals(background);
+            if (currentIsBackground) {
+                if (!previousIsBackground) {
+                    // An entity just ended: store it before handling the plain token.
+                    addEntityTag(currToken, newSentence);
+                    currToken.reset();
+                }
+                Tag tag = getTag(token);
+                if (tag != null) {
+                    newSentence.addTag(tag);
+                    newSentence.addOccurrence(token.beginPosition(), token.endPosition(), tag);
+                }
+            } else {
+                if (previousIsBackground) {
+                    // First token of a new entity.
+                    currToken.updateToken(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
+                } else if (!currentNe.equals(currToken.getNe())) {
+                    // A different entity type follows directly: store the old one, start the new one.
+                    addEntityTag(currToken, newSentence);
+                    currToken.reset();
+                    currToken.updateToken(StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class)));
+                } else {
+                    // Same entity continues: keep the text that separated the tokens.
+                    String before = StringUtils.getNotNullString(token.get(CoreAnnotations.BeforeAnnotation.class));
+                    String currentText = StringUtils.getNotNullString(token.get(CoreAnnotations.OriginalTextAnnotation.class));
+                    currToken.updateToken(before);
+                    currToken.updateToken(currentText);
+                }
+                // setBeginPosition only takes effect for the first token after a reset,
+                // so the span grows from the entity's first token to its latest one.
+                currToken.setBeginPosition(token.beginPosition());
+                currToken.setEndPosition(token.endPosition());
+            }
+            currToken.setNe(currentNe);
+        }
+
+        // Store an entity that runs up to the end of the sentence.
+        if (currToken.getToken().length() > 0) {
+            addEntityTag(currToken, newSentence);
+        }
+    }
+
+    /**
+     * Stores the entity accumulated in {@code entity} as a tag of {@code target}, labelled with the
+     * entity's named-entity type, and records its occurrence. Does not reset {@code entity}.
+     */
+    private void addEntityTag(TokenHolder entity, Sentence target) {
+        Tag tag = new Tag(entity.getToken());
+        tag.setNe(entity.getNe());
+        target.addTag(tag);
+        target.addOccurrence(entity.getBeginPosition(), entity.getEndPosition(), tag);
+    }
+
     public AnnotatedText sentiment(AnnotatedText annotated) {
         StanfordCoreNLP pipeline = pipelines.get(PIPELINE.SENTIMENT);
         annotated.getSentences().parallelStream().forEach((item) -> {
@@ -175,8 +195,7 @@ public AnnotatedText sentiment(AnnotatedText annotated) {
             List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
             Optional<CoreMap> sentence = sentences.stream().findFirst();
             if (sentence != null && sentence.isPresent()) {
-                int score = extractSentiment(sentence.get());
-                item.setSentiment(score);
+                extractSentiment(sentence.get(), item);
             }
         });
         return annotated;
@@ -250,6 +269,61 @@ public boolean checkPuntuation(String value) {
         return !match.find();
     }
 
+    /**
+     * Mutable accumulator for a (possibly multi-token) named entity: its text, its
+     * named-entity label and the begin/end positions it spans.
+     * Static because it never uses state of the enclosing processor.
+     */
+    static class TokenHolder {
+
+        private String ne;
+        private final StringBuilder sb = new StringBuilder();
+        private int beginPosition;
+        private int endPosition;
+
+        public TokenHolder() {
+            reset();
+        }
+
+        /** @return the named-entity label last set, or {@code null} if none was set yet */
+        public String getNe() {
+            return ne;
+        }
+
+        /** @return the text accumulated since the last reset (empty if nothing was appended) */
+        public String getToken() {
+            return sb.toString();
+        }
+
+        /** @return the begin position, or -1 if none was set since the last reset */
+        public int getBeginPosition() {
+            return beginPosition;
+        }
+
+        /** @return the end position, or -1 if none was set since the last reset */
+        public int getEndPosition() {
+            return endPosition;
+        }
+
+        public void setNe(String ne) {
+            this.ne = ne;
+        }
+
+        /** Appends {@code tknStr} to the accumulated text. */
+        public void updateToken(String tknStr) {
+            this.sb.append(tknStr);
+        }
+
+        /**
+         * Sets the begin position only while it is still unset (negative), so the first
+         * value given after a reset wins and later calls are ignored.
+         */
+        public void setBeginPosition(int beginPosition) {
+            if (this.beginPosition < 0) {
+                this.beginPosition = beginPosition;
+            }
+        }
+
+        /** Sets the end position; unlike the begin position, every call overwrites the value. */
+        public void setEndPosition(int endPosition) {
+            this.endPosition = endPosition;
+        }
+
+        /**
+         * Clears the accumulated text and marks both positions as unset (-1).
+         * The named-entity label is deliberately left untouched.
+         */
+        public final void reset() {
+            sb.setLength(0);
+            beginPosition = -1;
+            endPosition = -1;
+        }
+    }
+
     static class PipelineBuilder {
 
         private final Properties properties = new Properties();
diff --git a/src/test/java/com/graphaware/nlp/processor/TextProcessorTest.java b/src/test/java/com/graphaware/nlp/processor/TextProcessorTest.java
@@ -17,6 +17,7 @@
 
 import com.graphaware.nlp.conceptnet5.ConceptNet5Importer;
 import com.graphaware.nlp.domain.AnnotatedText;
+import com.graphaware.nlp.domain.Sentence;
 import com.graphaware.nlp.domain.Tag;
 import com.graphaware.nlp.persistence.GraphPersistence;
 import com.graphaware.nlp.persistence.LocalGraphDatabase;
@@ -27,6 +28,7 @@
 import java.util.Map;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
 import org.junit.Test;
 import org.neo4j.graphdb.Node;
 import org.neo4j.graphdb.QueryExecutionException;
@@ -149,4 +151,37 @@ public void testSentiment() {
         assertEquals(1, annotateText.getSentences().size());
         assertEquals(4, annotateText.getSentences().get(0).getSentiment());
     }
+    
+    /**
+     * Annotates a four-sentence paragraph (no sentiment, no storing) and checks that the tags
+     * of a sentence can be looked up by the position at which they begin.
+     */
+    @Test
+    public void testAnnotatedTextWithPosition() {
+        TextProcessor textProcessor = new TextProcessor();
+        AnnotatedText annotateText = textProcessor.annotateText("On 8 May 2013, "
+                + "one week before the Pakistani election, the third author, "
+                + "in his keynote address at the Sentiment Analysis Symposium, "
+                + "forecast the winner of the Pakistani election. The chart "
+                + "in Figure 1 shows varying sentiment on the candidates for "
+                + "prime minister of Pakistan in that election. The next day, "
+                + "the BBC’s Owen Bennett Jones, reporting from Islamabad, wrote "
+                + "an article titled “Pakistan Elections: Five Reasons Why the "
+                + "Vote is Unpredictable,”1 in which he claimed that the election "
+                + "was too close to call. It was not, and despite his being in Pakistan, "
+                + "the outcome of the election was exactly as we predicted.", 1, false, false);
+
+        assertEquals(4, annotateText.getSentences().size());
+        Sentence sentence1 = annotateText.getSentences().get(0);
+        assertEquals(15, sentence1.getTags().size());
+        
+        // No occurrence is expected at position 0, where the text starts with "On".
+        assertNull(sentence1.getTagOccurrence(0));
+        // Multi-word entries ("8 May 2013", "one week") are expected as a single occurrence
+        // found at the position of their first word.
+        assertEquals("8 May 2013", sentence1.getTagOccurrence(3).getLemma());
+        assertEquals("one week", sentence1.getTagOccurrence(15).getLemma());
+        assertEquals("before", sentence1.getTagOccurrence(24).getLemma());
+        assertEquals("third", sentence1.getTagOccurrence(59).getLemma());
+        assertEquals("sentiment", sentence1.getTagOccurrence(103).getLemma());
+        assertEquals("forecast", sentence1.getTagOccurrence(133).getLemma());
+        // Position 184 must not be known to the first sentence...
+        assertNull(sentence1.getTagOccurrence(184));
+        
+        // ...because it belongs to the second one: the expected positions are offsets
+        // into the whole text, not into the individual sentence.
+        Sentence sentence2 = annotateText.getSentences().get(1);
+        assertEquals("chart", sentence2.getTagOccurrence(184).getLemma());
+        assertEquals("Figure", sentence2.getTagOccurrence(193).getLemma());
+    }
 }