Skip to content

Commit 7515999

Browse files
committed
Initial attempt at connecting the Semgrex parser to a uniq operation
1 parent 97ed9fc commit 7515999

File tree

3 files changed

+179
-26
lines changed

3 files changed

+179
-26
lines changed

src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,10 @@ SemgrexPattern Root() : {
119119
throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern");
120120
}
121121
}
122-
// TODO: do shit here
122+
// TODO: could add an error check that the keys are unique across node and edge names
123+
// that might require keeping edge names in a known set
124+
// TODO: edge names might need some upgrades anyway - shouldn't name them under negation, for example
125+
node = new UniqPattern(node, uniqKeys);
123126
}
124127
)?
125128
)
@@ -279,7 +282,7 @@ SemgrexPattern ModNode(GraphRelation r) : {
279282
} {
280283
( child = Child(r)
281284
| ( "!"
282-
{ startUnderNeg = underNodeNegation;
285+
{ startUnderNeg = underNodeNegation; // TODO: can negations be nested? If so, should they cancel?
283286
underNodeNegation = true; } child = Child(r) { underNodeNegation = startUnderNeg; } )
284287
)
285288
{

src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,47 @@ public SemgrexMatcher matcher(SemanticGraph hypGraph, Alignment alignment, Seman
335335
return matcher(hypGraph, alignment, txtGraph, true, hypGraph.getFirstRoot(), new LinkedHashMap<>(), new LinkedHashMap<>(), new LinkedHashMap<>(), new VariableStrings(), ignoreCase);
336336
}
337337

338+
// batch processing
339+
// -------------------------------------------------------------
340+
/**
341+
* Postprocess a set of results from the batch processing method
342+
*
343+
* TODO: make abstract
344+
*/
345+
public List<Pair<CoreMap, List<SemgrexMatch>>> postprocessMatches(List<Pair<CoreMap, List<SemgrexMatch>>> matches) {
346+
return matches;
347+
}
348+
349+
/**
350+
* Returns a list of matching sentences and each of the matches from those sentences.
351+
*<br>
352+
* Non-matching sentences are currently not returned (may change in the future to return an empty list).
353+
*/
354+
public List<Pair<CoreMap, List<SemgrexMatch>>> matchSentences(List<CoreMap> sentences) {
355+
List<Pair<CoreMap, List<SemgrexMatch>>> matches = new ArrayList<>();
356+
for (CoreMap sentence : sentences) {
357+
SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
358+
SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
359+
SemgrexMatcher matcher = matcher(graph);
360+
if ( ! matcher.find()) {
361+
continue;
362+
}
363+
matches.add(new Pair<>(sentence, new ArrayList<>()));
364+
boolean found = true;
365+
while (found) {
366+
matches.get(matches.size() - 1).second().add(new SemgrexMatch(this, matcher));
367+
found = matcher.find();
368+
}
369+
}
370+
371+
for (SemgrexPattern child : getChildren()) {
372+
matches = child.postprocessMatches(matches);
373+
}
374+
matches = postprocessMatches(matches);
375+
376+
return matches;
377+
}
378+
338379
// compile method
339380
// -------------------------------------------------------------
340381

@@ -444,30 +485,6 @@ public enum OutputFormat {
444485
CONLLU
445486
}
446487

447-
/**
448-
* Returns a list of matching sentences and each of the matches from those sentences.
449-
*<br>
450-
* Non-matching sentences are currently not returned (may change in the future to return an empty list).
451-
*/
452-
public List<Pair<CoreMap, List<SemgrexMatch>>> matchSentences(List<CoreMap> sentences) {
453-
List<Pair<CoreMap, List<SemgrexMatch>>> matches = new ArrayList<>();
454-
for (CoreMap sentence : sentences) {
455-
SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
456-
SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
457-
SemgrexMatcher matcher = matcher(graph);
458-
if ( ! matcher.find()) {
459-
continue;
460-
}
461-
matches.add(new Pair<>(sentence, new ArrayList<>()));
462-
boolean found = true;
463-
while (found) {
464-
matches.get(matches.size() - 1).second().add(new SemgrexMatch(this, matcher));
465-
found = matcher.find();
466-
}
467-
}
468-
return matches;
469-
}
470-
471488
private static final String PATTERN = "-pattern";
472489
private static final String TREE_FILE = "-treeFile";
473490
private static final String MODE = "-mode";
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
package edu.stanford.nlp.semgraph.semgrex;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
import java.util.HashSet;
6+
import java.util.List;
7+
import java.util.Map;
8+
import java.util.Set;
9+
10+
import edu.stanford.nlp.ling.IndexedWord;
11+
import edu.stanford.nlp.semgraph.SemanticGraph;
12+
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
13+
import edu.stanford.nlp.util.CoreMap;
14+
import edu.stanford.nlp.util.Pair;
15+
import edu.stanford.nlp.util.VariableStrings;
16+
17+
/**
18+
* At semgrex creation time, this takes a list of nodes or attributes.
19+
*<br>
20+
* At batch processing time, this pares a list of matches down to
21+
* one match for each matching attributes.
22+
*/
23+
public class UniqPattern extends SemgrexPattern {
24+
private static final long serialVersionUID = -38315768154569L;
25+
26+
private final SemgrexPattern child;
27+
private final List<String> keys;
28+
29+
public UniqPattern(SemgrexPattern child, List<String> keys) {
30+
this.child = child;
31+
this.keys = new ArrayList<>(keys);
32+
}
33+
34+
private String getKey(SemgrexMatch match, String key) {
35+
// TODO: could also do edge names or variable groups (once those exist)
36+
IndexedWord node = match.getNode(key);
37+
if (node == null) {
38+
return null;
39+
}
40+
return node.value();
41+
}
42+
43+
public List<Pair<CoreMap, List<SemgrexMatch>>> postprocessMatches(List<Pair<CoreMap, List<SemgrexMatch>>> matches) {
44+
// hashing lists should be okay here since the lists will not change
45+
// while the postprocessing is happening
46+
Set<List<String>> seenKeys = new HashSet<>();
47+
48+
List<Pair<CoreMap, List<SemgrexMatch>>> newMatches = new ArrayList<>();
49+
for (Pair<CoreMap, List<SemgrexMatch>> sentence : matches) {
50+
List<SemgrexMatch> newSentenceMatches = new ArrayList<>();
51+
for (SemgrexMatch match : sentence.second()) {
52+
List<String> matchKey = new ArrayList<>();
53+
for (String key : keys) {
54+
matchKey.add(getKey(match, key));
55+
}
56+
if (seenKeys.contains(matchKey)) {
57+
continue;
58+
}
59+
seenKeys.add(matchKey);
60+
newSentenceMatches.add(match);
61+
}
62+
if (newSentenceMatches.size() > 0) {
63+
newMatches.add(new Pair<>(sentence.first(), newSentenceMatches));
64+
}
65+
}
66+
67+
return newMatches;
68+
}
69+
70+
@Override
71+
public String localString() {
72+
return toString(true, false);
73+
}
74+
75+
@Override
76+
public String toString() {
77+
return toString(true, true);
78+
}
79+
80+
@Override
81+
public String toString(boolean hasPrecedence) {
82+
return toString(hasPrecedence, true);
83+
}
84+
85+
@Override
86+
public void setChild(SemgrexPattern n) {
87+
throw new UnsupportedOperationException("Child should only be set on a UniqPattern at creation time");
88+
}
89+
90+
@Override
91+
public List<SemgrexPattern> getChildren() {
92+
if (child == null) {
93+
return Collections.emptyList();
94+
} else {
95+
return Collections.singletonList(child);
96+
}
97+
}
98+
99+
public String toString(boolean hasPrecedence, boolean addChild) {
100+
StringBuilder sb = new StringBuilder();
101+
if (addChild) {
102+
sb.append(child.toString(true));
103+
}
104+
sb.append(" :: uniq");
105+
for (String key : keys) {
106+
sb.append(" ");
107+
sb.append(key);
108+
}
109+
return sb.toString();
110+
}
111+
112+
@Override
113+
public SemgrexMatcher matcher(SemanticGraph sg, IndexedWord node,
114+
Map<String, IndexedWord> namesToNodes,
115+
Map<String, String> namesToRelations,
116+
Map<String, SemanticGraphEdge> namesToEdges,
117+
VariableStrings variableStrings,
118+
boolean ignoreCase) {
119+
return child.matcher(sg, node, namesToNodes, namesToRelations, namesToEdges, variableStrings, ignoreCase);
120+
}
121+
122+
@Override
123+
public SemgrexMatcher matcher(SemanticGraph sg,
124+
Alignment alignment, SemanticGraph sg_align,
125+
boolean hyp, IndexedWord node,
126+
Map<String, IndexedWord> namesToNodes,
127+
Map<String, String> namesToRelations,
128+
Map<String, SemanticGraphEdge> namesToEdges,
129+
VariableStrings variableStrings,
130+
boolean ignoreCase) {
131+
return child.matcher(sg, alignment, sg_align, hyp, node, namesToNodes, namesToRelations, namesToEdges, variableStrings, ignoreCase);
132+
}
133+
}

0 commit comments

Comments
 (0)