Skip to content
This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 93cfb36

Browse files
committed
added support for phrases occurrences in text processor, some more refactoring
1 parent 979f7f6 commit 93cfb36

File tree

7 files changed

+324
-114
lines changed

7 files changed

+324
-114
lines changed

src/main/java/com/graphaware/nlp/domain/Labels.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@
2323
public enum Labels implements Label {
2424
AnnotatedText,
2525
Sentence,
26-
Tag
26+
Tag,
27+
Phrase
2728
}

src/main/java/com/graphaware/nlp/domain/CypherStatement.java renamed to src/main/java/com/graphaware/nlp/domain/PartOfTextOccurrence.java

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,34 +15,23 @@
1515
*/
1616
package com.graphaware.nlp.domain;
1717

18-
import java.util.Map;
18+
import com.graphaware.common.util.Pair;
1919

20-
/**
21-
*
22-
* @author ale
23-
*/
24-
public class CypherStatement {
25-
private String query;
26-
private Map<String, Object> params;
20+
class PartOfTextOccurrence<T> {
2721

28-
public CypherStatement(String query, Map<String, Object> params) {
29-
this.query = query;
30-
this.params = params;
31-
}
22+
private final T element;
23+
private final Pair<Integer, Integer> span;
3224

33-
public String getQuery() {
34-
return query;
25+
public PartOfTextOccurrence(T element, int begin, int end) {
26+
this.element = element;
27+
this.span = new Pair<>(begin, end);
3528
}
3629

37-
public void setQuery(String query) {
38-
this.query = query;
30+
public T getElement() {
31+
return element;
3932
}
4033

41-
public Map<String, Object> getParams() {
42-
return params;
34+
public Pair<Integer, Integer> getSpan() {
35+
return span;
4336
}
44-
45-
public void setParams(Map<String, Object> params) {
46-
this.params = params;
47-
}
4837
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright (c) 2013-2016 GraphAware
3+
*
4+
* This file is part of the GraphAware Framework.
5+
*
6+
* GraphAware Framework is free software: you can redistribute it and/or modify it under the terms of
7+
* the GNU General Public License as published by the Free Software Foundation, either
8+
* version 3 of the License, or (at your option) any later version.
9+
*
10+
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11+
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12+
* See the GNU General Public License for more details. You should have received a copy of
13+
* the GNU General Public License along with this program. If not, see
14+
* <http://www.gnu.org/licenses/>.
15+
*/
16+
package com.graphaware.nlp.domain;
17+
18+
public class Phrase {
19+
private final String content;
20+
21+
public Phrase(String content) {
22+
this.content = content.trim();
23+
}
24+
25+
public String getContent() {
26+
return content;
27+
}
28+
29+
@Override
30+
public boolean equals(Object o) {
31+
if (o == null || !(o instanceof Phrase))
32+
return false;
33+
return this.content.equalsIgnoreCase(((Phrase)o).content);
34+
}
35+
36+
}

src/main/java/com/graphaware/nlp/domain/Sentence.java

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,17 @@
1515
*/
1616
package com.graphaware.nlp.domain;
1717

18-
import com.graphaware.common.util.Pair;
1918
import static com.graphaware.nlp.domain.SentimentLabels.*;
2019
import static com.graphaware.nlp.domain.Labels.Sentence;
2120
import static com.graphaware.nlp.domain.Properties.HASH;
2221
import static com.graphaware.nlp.domain.Properties.PROPERTY_ID;
2322
import static com.graphaware.nlp.domain.Properties.TEXT;
2423
import static com.graphaware.nlp.domain.Relationships.HAS_TAG;
2524
import static com.graphaware.nlp.util.HashFunctions.MD5;
25+
import java.util.ArrayList;
2626
import java.util.Collection;
2727
import java.util.HashMap;
28+
import java.util.List;
2829
import java.util.Map;
2930
import org.neo4j.graphdb.GraphDatabaseService;
3031
import org.neo4j.graphdb.Node;
@@ -33,11 +34,15 @@
3334

3435
public class Sentence implements Persistable {
3536

37+
public static final int NO_SENTIMENT = -1;
38+
3639
private final Map<String, Tag> tags;
37-
private Map<Integer, TagOccurrence> tagOccurrences;
40+
private Map<Integer, PartOfTextOccurrence<Tag>> tagOccurrences;
41+
private Map<Integer, Map<Integer, PartOfTextOccurrence<Phrase>>> phraseOccurrences;
3842

3943
private final String sentence;
40-
private int sentiment = -1;
44+
private int sentiment = NO_SENTIMENT;
45+
4146
private boolean store = false;
4247
private String id;
4348

@@ -76,29 +81,61 @@ public String getId() {
7681
return id;
7782
}
7883

79-
public void addOccurrence(int begin, int end, Tag tag) {
84+
public void addTagOccurrence(int begin, int end, Tag tag) {
8085
if (begin < 0) {
8186
throw new RuntimeException("Begin cannot be negative (for tag: " + tag.getLemma() + ")" );
8287
}
8388
if (tagOccurrences == null) {
8489
tagOccurrences = new HashMap<>();
8590
}
8691
//Will update end if already exist
87-
tagOccurrences.put(begin, new TagOccurrence(tag, begin, end));
92+
tagOccurrences.put(begin, new PartOfTextOccurrence<>(tag, begin, end));
8893
}
8994

9095
//Currently used only for testing purposes
9196
public Tag getTagOccurrence(int begin) {
9297
if (begin < 0) {
9398
throw new RuntimeException("Begin cannot be negative");
9499
}
95-
TagOccurrence occurrence = tagOccurrences.get(begin);
100+
PartOfTextOccurrence<Tag> occurrence = tagOccurrences.get(begin);
96101
if (occurrence != null) {
97-
return occurrence.getTag();
102+
return occurrence.getElement();
98103
} else {
99104
return null;
100105
}
101106
}
107+
108+
public void addPhraseOccurrence(int begin, int end, Phrase phrase) {
109+
if (begin < 0) {
110+
throw new RuntimeException("Begin cannot be negative (for phrase: " + phrase.getContent()+ ")" );
111+
}
112+
if (phraseOccurrences == null) {
113+
phraseOccurrences = new HashMap<>();
114+
}
115+
if (!phraseOccurrences.containsKey(begin)) {
116+
phraseOccurrences.put(begin, new HashMap<>());
117+
}
118+
//Will update end if already exist
119+
phraseOccurrences.get(begin).put(end, new PartOfTextOccurrence<>(phrase, begin, end));
120+
}
121+
122+
//Currently used only for testing purposes
123+
public List<Phrase> getPhraseOccurrence(int begin) {
124+
if (begin < 0) {
125+
throw new RuntimeException("Begin cannot be negative");
126+
}
127+
Map<Integer, PartOfTextOccurrence<Phrase>> occurrence = phraseOccurrences.get(begin);
128+
129+
if (occurrence != null) {
130+
List<Phrase> result = new ArrayList<>();
131+
occurrence.values().stream().forEach((item) -> {
132+
result.add(item.getElement());
133+
});
134+
return result;
135+
} else {
136+
return new ArrayList<>();
137+
}
138+
}
102139

103140
@Override
104141
public Node storeOnGraph(GraphDatabaseService database) {
@@ -165,23 +202,4 @@ private Node checkIfExist(GraphDatabaseService database, Object id) {
165202
}
166203
return null;
167204
}
168-
169-
class TagOccurrence {
170-
171-
private final Tag tag;
172-
private final Pair<Integer, Integer> span;
173-
174-
public TagOccurrence(Tag tag, int begin, int end) {
175-
this.tag = tag;
176-
this.span = new Pair<>(begin, end);
177-
}
178-
179-
public Tag getTag() {
180-
return tag;
181-
}
182-
183-
public Pair<Integer, Integer> getSpan() {
184-
return span;
185-
}
186-
}
187205
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* To change this license header, choose License Headers in Project Properties.
3+
* To change this template file, choose Tools | Templates
4+
* and open the template in the editor.
5+
*/
6+
package com.graphaware.nlp.processor;
7+
8+
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
9+
import java.util.Properties;
10+
11+
public class PipelineBuilder {
12+
private static final String CUSTOM_STOP_WORD_LIST = "start,starts,period,periods,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,o,on,or,such,that,the,their,then,there,these,they,this,to,was,will,with";
13+
14+
private final Properties properties = new Properties();
15+
private final StringBuilder annotattors = new StringBuilder(); //basics annotators
16+
private int threadsNumber = 4;
17+
18+
public PipelineBuilder tokenize() {
19+
checkForExistingAnnotators();
20+
annotattors.append("tokenize, ssplit, pos, lemma, ner");
21+
return this;
22+
}
23+
24+
private void checkForExistingAnnotators() {
25+
if (annotattors.toString().length() > 0) {
26+
annotattors.append(", ");
27+
}
28+
}
29+
30+
public PipelineBuilder extractSentiment() {
31+
checkForExistingAnnotators();
32+
annotattors.append("parse, sentiment");
33+
return this;
34+
}
35+
36+
public PipelineBuilder extractRelations() {
37+
checkForExistingAnnotators();
38+
annotattors.append("relation");
39+
return this;
40+
}
41+
42+
public PipelineBuilder extractCoref() {
43+
checkForExistingAnnotators();
44+
annotattors.append("mention, coref");
45+
properties.setProperty("coref.doClustering", "true");
46+
properties.setProperty("coref.md.type", "rule");
47+
properties.setProperty("coref.mode", "statistical");
48+
return this;
49+
}
50+
51+
public PipelineBuilder defaultStopWordAnnotator() {
52+
checkForExistingAnnotators();
53+
annotattors.append("stopword");
54+
properties.setProperty("customAnnotatorClass.stopword", "com.graphaware.nlp.processor.StopwordAnnotator");
55+
properties.setProperty(StopwordAnnotator.STOPWORDS_LIST, CUSTOM_STOP_WORD_LIST);
56+
return this;
57+
}
58+
59+
public PipelineBuilder stopWordAnnotator(Properties properties) {
60+
properties.entrySet().stream().forEach((entry) -> {
61+
this.properties.setProperty((String) entry.getKey(), (String) entry.getValue());
62+
});
63+
return this;
64+
}
65+
66+
public PipelineBuilder threadNumber(int threads) {
67+
this.threadsNumber = threads;
68+
return this;
69+
}
70+
71+
public StanfordCoreNLP build() {
72+
properties.setProperty("annotators", annotattors.toString());
73+
properties.setProperty("threads", String.valueOf(threadsNumber));
74+
StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
75+
return pipeline;
76+
}
77+
}

0 commit comments

Comments
 (0)