Skip to content

Commit 1edb99a

Browse files
committed
Add the ability for semgrex to uniq matches based on a regex variable string, including a couple of tests of the ability. Also check for name conflicts with nodes so that there are no ambiguous cases
1 parent 3b60637 commit 1edb99a

File tree

4 files changed

+57
-8
lines changed

4 files changed

+57
-8
lines changed

src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class SemgrexParser implements SemgrexParserConstants {
2121
// keep track of which variables we've already seen
2222
// lets us make sure we don't name new nodes under a negation
2323
private Set<String> knownVariables = Generics.newHashSet();
24+
// keep track of which regex variable groups we've already seen
25+
// lets uniq operate on regex variable groups as well as on node names
26+
private Set<String> knownVarGroups = Generics.newHashSet();
2427

2528
private static final Redwood.RedwoodChannels log = Redwood.channels(SemgrexParser.class);
2629
private boolean deprecatedAmp = false;
@@ -122,8 +125,11 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
122125
uniqKeys.add(nextIdentifier.image);
123126
}
124127
for (String key : uniqKeys) {
125-
if (!knownVariables.contains(key)) {
126-
{if (true) throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern");}
128+
if (!knownVariables.contains(key) && !knownVarGroups.contains(key)) {
129+
{if (true) throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern (as a node or regex)");}
130+
}
131+
if (knownVariables.contains(key) && knownVarGroups.contains(key)) {
132+
{if (true) throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which is very confusing, as it is both a node and a regex. Please rename one of them");}
127133
}
128134
}
129135
// TODO: can error check that the keys are unique between node and edge names
@@ -633,6 +639,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
633639
groupVar = identifier();
634640
// TODO: this should have been NUMBER, but that doesn't seem to exist
635641
varGroups.add(new Pair<Integer,String>(Integer.parseInt(groupNum.image),groupVar.image));
642+
knownVarGroups.add(groupVar.image);
636643
}
637644
if (attr != null && value != null) {
638645
negated = attrType.image.equals("!:");

src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ class SemgrexParser {
2727
// keep track of which variables we've already seen
2828
// lets us make sure we don't name new nodes under a negation
2929
private Set<String> knownVariables = Generics.newHashSet();
30+
// keep track of which regex variable groups we've already seen
31+
// lets uniq operate on regex variable groups as well as on node names
32+
private Set<String> knownVarGroups = Generics.newHashSet();
3033

3134
private static final Redwood.RedwoodChannels log = Redwood.channels(SemgrexParser.class);
3235
private boolean deprecatedAmp = false;
@@ -116,8 +119,11 @@ SemgrexPattern Root() : {
116119
"::" <UNIQ> { uniqKeys = new ArrayList<>(); } (nextIdentifier = identifier() { uniqKeys.add(nextIdentifier.image); })*
117120
{
118121
for (String key : uniqKeys) {
119-
if (!knownVariables.contains(key)) {
120-
throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern");
122+
if (!knownVariables.contains(key) && !knownVarGroups.contains(key)) {
123+
throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which does not exist in the pattern (as a node or regex)");
124+
}
125+
if (knownVariables.contains(key) && knownVarGroups.contains(key)) {
126+
throw new SemgrexParseException("Semgrex pattern asked for uniq of node " + key + " which is very confusing, as it is both a node and a regex. Please rename one of them");
121127
}
122128
}
123129
// TODO: can error check that the keys are unique between node and edge names
@@ -316,6 +322,7 @@ void AddAttribute(NodeAttributes attributes) : {
316322
( ( ( "#" groupNum = identifier() "%" groupVar = identifier() ) {
317323
// TODO: this should have been NUMBER, but that doesn't seem to exist
318324
varGroups.add(new Pair<Integer,String>(Integer.parseInt(groupNum.image),groupVar.image));
325+
knownVarGroups.add(groupVar.image);
319326
} )* )
320327
{
321328
if (attr != null && value != null) {

src/edu/stanford/nlp/semgraph/semgrex/UniqPattern.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,16 @@ public UniqPattern(SemgrexPattern child, List<String> keys) {
3232
}
3333

3434
private String getKey(SemgrexMatch match, String key) {
35-
// TODO: could also do edge names or variable groups (once those exist)
35+
// TODO: could also do edge names
3636
IndexedWord node = match.getNode(key);
37-
if (node == null) {
38-
return null;
37+
if (node != null) {
38+
return node.value();
3939
}
40-
return node.value();
40+
String varString = match.getVariableString(key);
41+
if (varString != null) {
42+
return varString;
43+
}
44+
return null;
4145
}
4246

4347
public List<Pair<CoreMap, List<SemgrexMatch>>> postprocessMatches(List<Pair<CoreMap, List<SemgrexMatch>>> matches, boolean keepEmptyMatches) {

test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,12 +1524,32 @@ public void testBrokenUniq() {
15241524
}
15251525
}
15261526

1527+
/**
1528+
* Test that a uniq expression throws an exception when a node and a regex variable group share the same name
1529+
*<br>
1530+
* Specifically, the expectation is for a SemgrexParseException
1531+
*/
1532+
public void testOverlappingUniq() {
1533+
try {
1534+
String pattern = "{word:__#1%foo}=foo :: uniq foo";
1535+
SemgrexPattern semgrex = SemgrexPattern.compile(pattern);
1536+
throw new RuntimeException("This expression should fail because the node name and regex name overlap");
1537+
} catch (SemgrexParseException e) {
1538+
// yay
1539+
}
1540+
}
1541+
15271542
/**
15281543
* Test that a simple uniq expression is correctly parsed
15291544
*/
15301545
public void testParsesUniq() {
1546+
// Test the basic node name compilation
15311547
String pattern = "{word:foo}=foo :: uniq foo";
15321548
SemgrexPattern semgrex = SemgrexPattern.compile(pattern);
1549+
1550+
// Test the basic regex compilation
1551+
pattern = "{word:__#1%foo} :: uniq foo";
1552+
semgrex = SemgrexPattern.compile(pattern);
15331553
}
15341554

15351555
/**
@@ -1570,6 +1590,17 @@ public void testBatchUniq() {
15701590
assertEquals(1, matches.get(1).second().size());
15711591
assertEquals(BATCH_PARSES[3], matches.get(2).first().get(CoreAnnotations.TextAnnotation.class));
15721592
assertEquals(1, matches.get(2).second().size());
1593+
1594+
// test the uniq operator on a regex match
1595+
semgrex = SemgrexPattern.compile("{word:__#1%x} !< {} :: uniq x");
1596+
matches = semgrex.matchSentences(sentences, false);
1597+
assertEquals(2, matches.size());
1598+
assertEquals(BATCH_PARSES[0], matches.get(0).first().get(CoreAnnotations.TextAnnotation.class));
1599+
assertEquals(1, matches.get(0).second().size());
1600+
assertEquals("foo", matches.get(0).second().get(0).getVariableString("x"));
1601+
assertEquals(BATCH_PARSES[2], matches.get(1).first().get(CoreAnnotations.TextAnnotation.class));
1602+
assertEquals(1, matches.get(1).second().size());
1603+
assertEquals("bar", matches.get(1).second().get(0).getVariableString("x"));
15731604
}
15741605

15751606
public void testRegexVariableGroups() {

0 commit comments

Comments
 (0)