Merge pull request #124 from percyliang/geo880

percyliang · web-flow · commit 3ed286f542aa · 2016-12-06T18:18:23.000-08:00
adding a geo880 module as a playground
diff --git a/build.xml b/build.xml
@@ -88,6 +88,15 @@
     <jar destfile="${libsempre}/sempre-overnight.jar" basedir="${classes}/overnight"/>
   </target>
 
+  <!-- Compile geo880: runs after init, core, corenlp and tables; builds only the
+       edu/stanford/nlp/sempre/geo880/ sources and packages them as sempre-geo880.jar. -->
+  <target name="geo880" depends="init,core,corenlp,tables">
+    <echo message="Compiling ${ant.project.name}: geo880"/>
+    <!-- Module-specific output directory for the compiled classes. -->
+    <mkdir dir="${classes}/geo880"/>
+    <javac srcdir="${src}" destdir="${classes}/geo880" classpathref="lib.path" debug="true" includeantruntime="false" source="${source}" target="${target}">
+      <!-- Restrict compilation to the geo880 package. -->
+      <include name="edu/stanford/nlp/sempre/geo880/"/>
+    </javac>
+    <!-- Bundle the module's classes into its own jar under ${libsempre}. -->
+    <jar destfile="${libsempre}/sempre-geo880.jar" basedir="${classes}/geo880"/>
+  </target>
 
   <!-- Clean up -->
   <target name="clean">
diff --git a/pull-dependencies b/pull-dependencies
@@ -200,6 +200,15 @@ addModule('esslli_2016', 'Data for ESSLLI 2016 semantic parsing class', lambda {
   pull('/u/nlp/data/semparse/esslli_2016', 'data/esslli_2016/', {:symlink => true})
 })
 
+# geo880 module: pulls the example files, grammar, lexicon, knowledge graph (.kg) and
+# type hierarchy from /u/nlp/data/semparse/geo880 into data/geo880 (with :symlink => true).
+addModule('geo880', 'Data, lexicon, grammars and KB for geo880', lambda {
+  # Example files
+  pull('/u/nlp/data/semparse/geo880/geo880-test.examples', 'data/geo880', {:symlink => true})
+  pull('/u/nlp/data/semparse/geo880/geo880-test.preprocessed.examples', 'data/geo880', {:symlink => true})
+  pull('/u/nlp/data/semparse/geo880/geo880-train.preprocessed.examples', 'data/geo880', {:symlink => true})
+  # Grammar, lexicon, knowledge graph and type hierarchy
+  pull('/u/nlp/data/semparse/geo880/geo880.grammar', 'data/geo880', {:symlink => true})
+  pull('/u/nlp/data/semparse/geo880/geo880.lexicon', 'data/geo880', {:symlink => true})
+  pull('/u/nlp/data/semparse/geo880/geo880.kg', 'data/geo880', {:symlink => true})
+  pull('/u/nlp/data/semparse/geo880/geo880.type_hierarchy', 'data/geo880', {:symlink => true})
+})
 ############################################################
 
 if ARGV.size == 0
diff --git a/run b/run
@@ -918,6 +918,72 @@ addMode('genovernight-wrapper', 'Generate utterances for overnight semantic pars
   lambda { |e| system 'mkdir -p genovernight.out'; o('execDir', 'genovernight.out/' + e[:domain]) },
 nil) })
 
+addMode('geo880', 'Semantic parsing on the geo880 dataset', lambda { |e| l(
+  # Usual header
+  header('core,tables,corenlp,geo880'),
+  'edu.stanford.nlp.sempre.Main',
+  # Fig parameters
+  figOpts,
+  o('executor', 'tables.lambdadcs.LambdaDCSExecutor'),
+  o('JoinFn.specializedTypeCheck', false), o('JoinFn.typeInference', false),
+  # Parser
+  o('Builder.parser', 'BeamParser'),
+  o('Parser.coarsePrune'),
+ 
+  # Evaluation
+  o('Builder.valueEvaluator', 'geo880.Geo880ValueEvaluator'),
+
+  # Grammar
+  o('Grammar.inPaths','lib/data/geo880/geo880.grammar'),
+
+  # Type hierarchy
+  o('Geo880TypeLookup.typeHierarchyPath', 'lib/data/geo880/geo880.type_hierarchy'),
+  o('TypeInference.typeLookup','geo880.Geo880TypeLookup'),
+
+  # Yrkvpba
+  o('SimpleLexicon.inPaths', 'lib/data/geo880/geo880.lexicon'),
+  
+  # Learner
+  o('Learner.maxTrainIters', 3),
+
+  # Dataset
+  letDefault(:data, 0),
+  sel(:data,
+    l(o('Dataset.inPaths', 'train,lib/data/geo880/geo880-train.preprocessed.examples'), unbalancedTrainDevSplit), # (0) train 0.8, dev 0.2
+    l(o('Dataset.inPaths', 'train,lib/data/geo880/geo880-train.examples', 'test,lib/data/geo880/geo880-test.preprocessed/examples')), # (1) Don't run on test yet!
+  nil),
+  # Load the graph
+  o('Dataset.globalGraphPath', 'lib/data/geo880/geo880.kg'),
+  # Verbosity
+  letDefault(:verbose, 0),
+  sel(:verbose,
+    l(),
+    l(
+      o('showRules'),
+      o('Parser.verbose', 2),
+      o('JoinFn.verbose', 3),
+      o('JoinFn.showTypeCheckFailures'),
+    nil),
+  nil),
+  # Language Analyzer
+  l(o('LanguageAnalyzer', 'corenlp.CoreNLPAnalyzer'), o('annotators', *'tokenize ssplit pos lemma ner'.split)),
+  # Regularization
+  letDefault(:l1, 0),
+  sel(:l1,
+    l(),
+    l(o('Params.l1Reg','lazy'), o('Params.l1RegCoeff', '3e-5')),
+    l(o('Params.l1Reg','lazy'), selo(nil, 'Params.l1RegCoeff', 0, 0.00001, 0.0001, 0.001, 0.01)),
+  nil),
+  # Features
+  letDefault(:feat, 'freebase'),
+  sel(:feat, {
+    'none' => l(),   # No features (random)
+    'freebase' => l(
+      o('FeatureExtractor.featureDomains', 'rule opCount constant whType span lemmaAndBinaries denotation lexAlign joinPos skipPos'.split),
+#      o('FeatureExtractor.featureDomains', 'rule opCount constant whType lemmaAndBinaries denotation lexAlign joinPos skipPos'.split),
+    nil),
+  }),
+nil) })
 
 ############################################################
 
diff --git a/src/edu/stanford/nlp/sempre/ContextValue.java b/src/edu/stanford/nlp/sempre/ContextValue.java
@@ -68,7 +68,7 @@ public ContextValue(String user, DateValue date, List<Exchange> exchanges) {
   }
 
   public ContextValue(KnowledgeGraph graph) {
-    this(null, null, null, graph);
+    this(null, null, new ArrayList(), graph);
   }
 
   // Example:
@@ -107,8 +107,11 @@ public LispTree toLispTree() {
       tree.addChild(LispTree.proto.newList("user", user));
     if (date != null)
       tree.addChild(date.toLispTree());
+		// When logging examples, logging the entire graph takes too much screen space.
+		// I don't think that we ever deserialize a graph from a serialized context,
+		// so this should be fine.
     if (graph != null)
-      tree.addChild(graph.toLispTree());
+      tree.addChild(graph.toShortLispTree());
     for (Exchange e : exchanges)
       tree.addChild(LispTree.proto.newList("exchange", e.toLispTree()));
     return tree;
diff --git a/src/edu/stanford/nlp/sempre/Dataset.java b/src/edu/stanford/nlp/sempre/Dataset.java
@@ -38,6 +38,9 @@ public static class Options {
 
     @Option(gloss = "Only keep examples which have at most this number of tokens")
     public int maxTokens = Integer.MAX_VALUE;
+
+    @Option(gloss = "Path to a knowledge graph that will be uploaded as global context")
+    public String globalGraphPath;
   }
 
   public static Options opts = new Options();
@@ -96,10 +99,22 @@ public void readFromPathPairs(List<Pair<String, String>> pathPairs) {
         return;
       }
     }
-
     readLispTreeFromPathPairs(pathPairs);
+    updateGlobalContext();
+  }
+
+  private void updateGlobalContext() {
+    if (opts.globalGraphPath != null) {
+      KnowledgeGraph graph = NaiveKnowledgeGraph.fromFile(opts.globalGraphPath);
+      for (String group : allExamples.keySet()) {
+        for (Example ex : allExamples.get(group)) {
+          ex.setContext(new ContextValue(graph));
+        }
+      }
+    }
   }
 
+
   private void readJsonFromPathPairs(List<Pair<String, String>> pathPairs) {
     List<GroupInfo> groups = Lists.newArrayListWithCapacity(pathPairs.size());
     for (Pair<String, String> pathPair : pathPairs) {
diff --git a/src/edu/stanford/nlp/sempre/FeatureExtractor.java b/src/edu/stanford/nlp/sempre/FeatureExtractor.java
@@ -211,10 +211,12 @@ void conjoinLemmaAndBinary(Example ex, Derivation deriv) {
     List<String> nonEntityLemmas = new LinkedList<>();
     extractNonEntityLemmas(ex, deriv, nonEntityLemmas);
     List<String> binaries = extractBinaries(deriv.formula);
-    String binariesStr = Joiner.on('_').join(binaries);
-    for (String nonEntityLemma : nonEntityLemmas) {
-      deriv.addFeature("lemmaAndBinaries", "nonEntitylemmas=" + nonEntityLemma +
-              ",binaries=" + binariesStr);
+    if (!binaries.isEmpty()) {
+      String binariesStr = Joiner.on('_').join(binaries);
+      for (String nonEntityLemma : nonEntityLemmas) {
+        deriv.addFeature("lemmaAndBinaries", "nonEntitylemmas=" + nonEntityLemma +
+          ",binaries=" + binariesStr);
+      }
     }
   }
 
diff --git a/src/edu/stanford/nlp/sempre/KnowledgeGraph.java b/src/edu/stanford/nlp/sempre/KnowledgeGraph.java
@@ -85,6 +85,7 @@ public static List<Pair<Value, Value>> getReversedPairs(Collection<Pair<Value, V
   // ============================================================
 
   public abstract LispTree toLispTree();
+  /**
+   * Returns a Lisp-tree form of this graph suitable for logging. Implementations may
+   * return a short placeholder instead of the full contents (e.g. for large graphs).
+   */
+  public abstract LispTree toShortLispTree();
   @Override public String toString() { return toLispTree().toString(); }
 
   /** Return all y such that x in firsts and (x,r,y) in graph */
diff --git a/src/edu/stanford/nlp/sempre/NaiveKnowledgeGraph.java b/src/edu/stanford/nlp/sempre/NaiveKnowledgeGraph.java
@@ -198,4 +198,16 @@ public LispTree toLispTree() {
     }
     return tree;
   }
+
+  /**
+   * Returns the full Lisp tree for graphs with at most 1000 triples; for larger graphs
+   * returns the placeholder list (graph NaiveKnowledgeGraph TooManyTriples).
+   */
+  @Override
+  public LispTree toShortLispTree() {
+    // Small enough to print in full.
+    if (triples.size() <= 1000) return toLispTree();
+
+    LispTree placeholder = LispTree.proto.newList();
+    placeholder.addChild("graph");
+    placeholder.addChild("NaiveKnowledgeGraph");
+    placeholder.addChild("TooManyTriples");
+    return placeholder;
+  }
 }
diff --git a/src/edu/stanford/nlp/sempre/geo880/Geo880TypeLookup.java b/src/edu/stanford/nlp/sempre/geo880/Geo880TypeLookup.java
@@ -0,0 +1,110 @@
+package edu.stanford.nlp.sempre.geo880;
+
+import edu.stanford.nlp.sempre.SemType;
+import edu.stanford.nlp.sempre.SemTypeHierarchy;
+import edu.stanford.nlp.sempre.TypeLookup;
+import fig.basic.IOUtils;
+import fig.basic.Option;
+import fig.basic.LogInfo;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Type lookup for the geo880 domain, mostly for distinguishing locations and numbers.
+ * We also use a type hierarchy provided by a file to match |location.us_state| and |location.location| etc.
+ * Created by joberant on 05/12/2016.
+ */
+public class Geo880TypeLookup implements TypeLookup{
+  public static class Options {
+    @Option(gloss = "Verbosity") public int verbose = 0;
+    @Option(gloss = "A path to a file that specified the type hierarchy.")
+    public String typeHierarchyPath;
+
+  }
+  public static Options opts = new Options();
+  public static final String LOCATION = "fb:location.location";
+  public static final String CITY = "fb:location.citytown";
+  public static final String STATE = "fb:location.us_state";
+  public static final String RIVER = "fb:location.river";
+  public static final String LAKE = "fb:location.lake";
+  public static final String MOUNTAIN = "fb:location.mountain";
+  public static final String COUNTRY = "fb:location.country";
+
+  // Type given to the value side of numeric properties.
+  private static final String NUMBER = "fb:type.number";
+
+  // Property suffixes whose value is a number rather than a location.
+  private static final Set<String> NUMERIC_PROPERTY_SUFFIXES = new HashSet<>();
+  static {
+    NUMERIC_PROPERTY_SUFFIXES.add("density");
+    NUMERIC_PROPERTY_SUFFIXES.add("elevation");
+    NUMERIC_PROPERTY_SUFFIXES.add("population");
+    NUMERIC_PROPERTY_SUFFIXES.add("size");
+    NUMERIC_PROPERTY_SUFFIXES.add("area");
+    NUMERIC_PROPERTY_SUFFIXES.add("length");
+  }
+
+  /**
+   * Loads the type hierarchy from {@code opts.typeHierarchyPath} (if set) into
+   * {@code SemTypeHierarchy.singleton}. Each non-blank line is expected to hold three
+   * whitespace-separated tokens, the middle one ending in "included_types".
+   *
+   * @throws RuntimeException if the file cannot be read or a line has fewer than three tokens
+   */
+  public Geo880TypeLookup() {
+    SemTypeHierarchy semTypeHierarchy = SemTypeHierarchy.singleton;
+    if (opts.typeHierarchyPath != null) {
+      try {
+        for (String line : IOUtils.readLines(opts.typeHierarchyPath)) {
+          // Tolerate blank lines and leading/trailing whitespace.
+          String trimmed = line.trim();
+          if (trimmed.isEmpty()) continue;
+          String[] tokens = trimmed.split("\\s+");
+          if (tokens.length < 3) {
+            throw new RuntimeException(
+                "Malformed line in type hierarchy file " + opts.typeHierarchyPath + ": " + line);
+          }
+
+          // Check the file only contains relations about supertypes.
+          assert tokens[1].endsWith("included_types") : "Unexpected relation in type hierarchy: " + line;
+          // Register both types with themselves, then relate the first to the third token.
+          semTypeHierarchy.addSupertype(tokens[0], tokens[0]);
+          semTypeHierarchy.addSupertype(tokens[2], tokens[2]);
+          semTypeHierarchy.addSupertype(tokens[0], tokens[2]);
+        }
+      } catch (IOException e) {
+        // Keep the original exception as the cause so the stack trace is not lost.
+        throw new RuntimeException("Could not read lines from: " + opts.typeHierarchyPath, e);
+      }
+    }
+  }
+
+  /**
+   * Maps an entity id to its geo880 type, based on the text between the ':' and the first '.'.
+   *
+   * @param entity an entity id of the form fb:state.florida
+   * @return a union SemType over the matching location type
+   * @throws RuntimeException if the id has no recognizable type prefix
+   */
+  @Override
+  public SemType getEntityType(String entity) {
+    // Entities are of the form fb:state.florida.
+    int colonIndex = entity.indexOf(':');
+    int dotIndex = entity.indexOf('.');
+    if (dotIndex <= colonIndex) {
+      // No '.' after the prefix: report it as an illegal entity rather than failing in substring().
+      throw new RuntimeException("Illegal entity: " + entity);
+    }
+
+    String type;
+    switch (entity.substring(colonIndex + 1, dotIndex)) {
+      case "place":
+        type = LOCATION;
+        break;
+      case "city":
+        type = CITY;
+        break;
+      case "state":
+        type = STATE;
+        break;
+      case "river":
+        type = RIVER;
+        break;
+      case "lake":
+        type = LAKE;
+        break;
+      case "mountain":
+        type = MOUNTAIN;
+        break;
+      case "country":
+        type = COUNTRY;
+        break;
+      default:
+        throw new RuntimeException("Illegal entity: " + entity);
+    }
+    SemType result = SemType.newUnionSemType(type);
+    if (opts.verbose >= 1) {
+      LogInfo.logs("Entity=%s, Type=%s", entity, result);
+    }
+    return result;
+  }
+
+  /**
+   * Builds the function type of a property: everything before the last '.' is used as one
+   * argument type, and the other is a number for numeric suffixes (density, elevation,
+   * population, size, area, length) or a location otherwise.
+   *
+   * @param property a property id of the form fb:location.location.population
+   * @return the function SemType for the property
+   * @throws RuntimeException if the property contains no '.'
+   */
+  @Override
+  public SemType getPropertyType(String property) {
+    // Properties are of the form fb:location.location.population.
+    int lastDot = property.lastIndexOf('.');
+    if (lastDot < 0) {
+      throw new RuntimeException("Illegal property: " + property);
+    }
+    String arg1 = property.substring(0, lastDot);
+    String suffix = property.substring(lastDot + 1);
+    String arg2 = NUMERIC_PROPERTY_SUFFIXES.contains(suffix) ? NUMBER : LOCATION;
+    SemType result = SemType.newFuncSemType(arg2, arg1);
+    if (opts.verbose >= 1) {
+      LogInfo.logs("Property=%s, Type=%s", property, result);
+    }
+    return result;
+  }
+}
diff --git a/src/edu/stanford/nlp/sempre/geo880/Geo880ValueEvaluator.java b/src/edu/stanford/nlp/sempre/geo880/Geo880ValueEvaluator.java
@@ -0,0 +1,85 @@
+package edu.stanford.nlp.sempre.geo880;
+
+import edu.stanford.nlp.sempre.*;
+import edu.stanford.nlp.sempre.tables.StringNormalizationUtils;
+import fig.basic.LogInfo;
+
+import java.util.List;
+
+/**
+ * Exact-match evaluator for geo880 denotations.
+ * This is only used because the data does not mention when a city is in the usa, but
+ * the kg returns usa, and we want to use exact match, so we add this logic here.
+ * Created by joberant on 03/12/2016.
+ */
+public class Geo880ValueEvaluator implements ValueEvaluator {
+
+  // Entity that is ignored in predictions when it appears alongside other names.
+  private static final String USA_ID = "fb:country.usa";
+
+  /**
+   * Scores a prediction against the target: 1 if the lists have the same size (after
+   * ignoring a redundant fb:country.usa in the prediction) and every target item matches
+   * some predicted item, 0 otherwise. The predicted list itself is not modified.
+   *
+   * @param target the gold value; must be a ListValue
+   * @param pred the predicted value; anything other than a ListValue scores 0
+   * @return 1 for a match, 0 otherwise
+   */
+  @Override
+  public double getCompatibility(Value target, Value pred) {
+    List<Value> targetList = ((ListValue) target).values;
+    if (!(pred instanceof ListValue)) return 0;
+    List<Value> predList = ((ListValue) pred).values;
+
+    // In geo880, if we return that something is contained in a state, there is no need to return fb:country.usa
+    int usaIndex = findRedundantUsa(predList);
+    int effectivePredSize = usaIndex < 0 ? predList.size() : predList.size() - 1;
+
+    if (targetList.size() != effectivePredSize) return 0;
+
+    for (Value targetValue : targetList) {
+      if (!hasMatch(targetValue, predList, usaIndex)) return 0;
+    }
+    return 1;
+  }
+
+  // Index of the first fb:country.usa entry to ignore, or -1 if the rule does not apply
+  // (single-element list, or a list that does not start with a NameValue).
+  private static int findRedundantUsa(List<Value> predList) {
+    if (predList.size() <= 1 || !(predList.get(0) instanceof NameValue)) return -1;
+    for (int i = 0; i < predList.size(); i++) {
+      Value v = predList.get(i);
+      // Check each element's type: only the first one is known to be a NameValue.
+      if (v instanceof NameValue && USA_ID.equals(((NameValue) v).id)) return i;
+    }
+    return -1;
+  }
+
+  // True if some predicted item, other than the one at skipIndex, is compatible with the target item.
+  private boolean hasMatch(Value targetValue, List<Value> predList, int skipIndex) {
+    for (int i = 0; i < predList.size(); i++) {
+      if (i == skipIndex) continue;
+      if (getItemCompatibility(targetValue, predList.get(i))) return true;
+    }
+    return false;
+  }
+
+  // ============================================================
+  // Item Compatibility
+  // ============================================================
+
+  /**
+   * Compares one element of the target list with one predicted element.
+   * Descriptions are matched against a predicted name's description, numbers are compared
+   * with a small tolerance, and everything else falls back to equals().
+   */
+  protected boolean getItemCompatibility(Value target, Value pred) {
+    if (pred instanceof ErrorValue) return false;  // Never award points for error
+    if (pred == null) {
+      LogInfo.warning("Predicted value is null!");
+      return false;
+    }
+
+    if (target instanceof DescriptionValue) {
+      String targetText = ((DescriptionValue) target).value;
+      if (pred instanceof NameValue) {
+        // Just has to match the description
+        String predText = ((NameValue) pred).description;
+        if (predText == null) predText = "";
+        return targetText.equals(predText);
+      }
+    } else if (target instanceof NumberValue) {
+      NumberValue targetNumber = (NumberValue) target;
+      if (pred instanceof NumberValue) {
+        return compareNumberValues(targetNumber, (NumberValue) pred);
+      }
+    }
+
+    return target.equals(pred);
+  }
+
+  /** Returns true when the two numbers differ by less than 1e-6. */
+  protected boolean compareNumberValues(NumberValue target, NumberValue pred) {
+    return Math.abs(target.value - pred.value) < 1e-6;
+  }
+
+}
diff --git a/src/edu/stanford/nlp/sempre/tables/TableKnowledgeGraph.java b/src/edu/stanford/nlp/sempre/tables/TableKnowledgeGraph.java

Original file line number	Diff line number	Diff line change
`@@ -198,4 +198,16 @@ public LispTree toLispTree() {`
`198`	`198`	`}`
`199`	`199`	`return tree;`
`200`	`200`	`}`
	`201`	`+`
	`202`	`+ @Override`
	`203`	`+ public LispTree toShortLispTree() {`
	`204`	`+ if (triples.size() > 1000) {`
	`205`	`+ LispTree tree = LispTree.proto.newList();`
	`206`	`+ tree.addChild("graph");`
	`207`	`+ tree.addChild("NaiveKnowledgeGraph");`
	`208`	`+ tree.addChild(("TooManyTriples"));`
	`209`	`+ return tree;`
	`210`	`+ }`
	`211`	`+ return toLispTree();`
	`212`	`+ }`
`201`	`213`	`}`