Merge pull request #21 from antlr/features-reboot

parrt · parrt · commit b0a647d16c63 · 2016-04-07T10:05:04.000-07:00
Features reboot
diff --git a/java/src/org/antlr/codebuff/CollectFeatures.java b/java/src/org/antlr/codebuff/CollectFeatures.java
diff --git a/java/src/org/antlr/codebuff/Corpus.java b/java/src/org/antlr/codebuff/Corpus.java
@@ -77,8 +77,8 @@ public void randomShuffleInPlace() {
 	public void buildTokenContextIndex() {
 		curAndPrevTokenRuleIndexToVectorsMap = new HashMap<>();
 		for (int i=0; i<X.size(); i++) {
-			int curTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_RULE];
-			int prevTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_PREV_RULE];
+			int curTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR];
+			int prevTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_EARLIEST_LEFT_ANCESTOR];
 			int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0];
 			int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0];
 			Pair<Integer, Integer> key = new Pair<>(pr, cr);
diff --git a/java/src/org/antlr/codebuff/FeatureType.java b/java/src/org/antlr/codebuff/FeatureType.java
@@ -1,7 +1,7 @@
 package org.antlr.codebuff;
 
 public enum FeatureType {
-	TOKEN(12), RULE(14), INT(7), BOOL(5), COL(7),
+	TOKEN(12), RULE(14), INT(12), BOOL(5), COL(7),
 	INFO_FILE(15), INFO_LINE(4), INFO_CHARPOS(4),
 	UNUSED(0);
 	public int displayWidth;
diff --git a/java/src/org/antlr/codebuff/Formatter.java b/java/src/org/antlr/codebuff/Formatter.java
@@ -19,12 +19,14 @@
 import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN;
 import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL;
 import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS;
+import static org.antlr.codebuff.CollectFeatures.CAT_NO_ALIGNMENT;
 import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
 import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
 import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
-import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
+import static org.antlr.codebuff.CollectFeatures.INDEX_MATCHING_TOKEN_DIFF_LINE;
 import static org.antlr.codebuff.CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD;
 import static org.antlr.codebuff.CollectFeatures.earliestAncestorStartingWithToken;
+import static org.antlr.codebuff.CollectFeatures.getMatchingSymbolOnDiffLine;
 import static org.antlr.codebuff.CollectFeatures.getNodeFeatures;
 import static org.antlr.codebuff.CollectFeatures.getRealTokens;
 import static org.antlr.codebuff.CollectFeatures.getTokensOnPreviousLine;
@@ -46,7 +48,6 @@ public class Formatter {
 	protected Vector<TokenPositionAnalysis> analysis = new Vector<>();
 
 	protected CodekNNClassifier nlwsClassifier;
-	protected CodekNNClassifier wsClassifier;
 	protected CodekNNClassifier alignClassifier;
 	protected int k;
 
@@ -112,13 +113,14 @@ public String format() {
 	public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
 		CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
 		String tokText = curToken.getText();
+		TerminalNode node = tokenToNodeMap.get(curToken);
 
 		emitCommentsToTheLeft(tokenIndexInStream);
 
 		int[] features = getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize);
 		// must set "prev end column" value as token stream doesn't have it;
 		// we're tracking it as we emit tokens
-		features[INDEX_PREV_END_COLUMN] = charPosInLine;
+//		features[INDEX_PREV_END_COLUMN] = charPosInLine;
 
 		int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD);
 		int newlines = 0;
@@ -130,23 +132,15 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
 			ws = CollectFeatures.unwscat(injectNL_WS);
 		}
 
-		// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
-		features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment
-
-		int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
-
-		TokenPositionAnalysis tokenPositionAnalysis =
-			getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
-		analysis.setSize(tokenIndexInStream+1);
-		analysis.set(tokenIndexInStream, tokenPositionAnalysis);
-
 		if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
 			ws = 1;
 		}
 
+		int align = CAT_NO_ALIGNMENT;
+
 		if ( newlines>0 ) {
 			output.append(Tool.newlines(newlines));
-			line++;
+			line+=newlines;
 			charPosInLine = 0;
 
 			List<Token> tokensOnPreviousLine = getTokensOnPreviousLine(tokens, tokenIndexInStream, line);
@@ -155,9 +149,15 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
 				firstTokenOnPrevLine = tokensOnPreviousLine.get(0);
 			}
 
-			TerminalNode node = tokenToNodeMap.get(curToken);
 			ParserRuleContext parent = (ParserRuleContext)node.getParent();
 
+			// getNodeFeatures() doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
+			features[INDEX_FIRST_ON_LINE] = newlines>0 ? 1 : 0; // use \n prediction to match exemplars for alignment
+			// if we decide to inject a newline, we better recompute this value before classifying alignment
+			features[INDEX_MATCHING_TOKEN_DIFF_LINE] = getMatchingSymbolOnDiffLine(doc, node, line);
+
+			align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
+
 			if ( align==CAT_INDENT ) {
 				if ( firstTokenOnPrevLine!=null ) { // if not on first line, we cannot indent
 					int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL;
@@ -169,10 +169,7 @@ else if ( (align&0xFF)==CAT_ALIGN_WITH_ANCESTOR_CHILD ) {
 				int[] deltaChild = CollectFeatures.unaligncat(align);
 				int deltaFromAncestor = deltaChild[0];
 				int childIndex = deltaChild[1];
-				ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken);
-				if ( earliestLeftAncestor==null ) {
-					earliestLeftAncestor = parent;
-				}
+				ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
 				ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor);
 				ParseTree child = ancestor.getChild(childIndex);
 				Token start = null;
@@ -194,10 +191,7 @@ else if ( child instanceof TerminalNode ){
 			}
 			else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
 				int deltaFromAncestor = CollectFeatures.unindentcat(align);
-				ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken);
-				if ( earliestLeftAncestor==null ) {
-					earliestLeftAncestor = parent;
-				}
+				ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
 				ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor);
 				Token start = ancestor.getStart();
 				int indentCol = start.getCharPositionInLine() + INDENT_LEVEL;
@@ -211,6 +205,11 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
 			charPosInLine += ws;
 		}
 
+		TokenPositionAnalysis tokenPositionAnalysis =
+			getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
+		analysis.setSize(tokenIndexInStream+1);
+		analysis.set(tokenIndexInStream, tokenPositionAnalysis);
+
 		// update Token object with position information now that we are about
 		// to emit it.
 		curToken.setLine(line);
@@ -227,6 +226,10 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
 	/** Look into the token stream to get the comments to the left of current
 	 *  token. Emit all whitespace and comments except for whitespace at the
 	 *  end as we'll inject that per newline prediction.
+	 *
+	 *  This assumes we are grooming, not totally reformatting.
+	 *  We are able to see the original input stream for comment purposes. With
+	 *  all whitespace removed, we can't emit this stuff properly at the moment.
 	 */
 	public void emitCommentsToTheLeft(int tokenIndexInStream) {
 		List<Token> hiddenTokensToLeft = tokens.getHiddenTokensToLeft(tokenIndexInStream);
@@ -270,7 +273,7 @@ public void emitCommentsToTheLeft(int tokenIndexInStream) {
 
 	public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealTokens, int tokenIndexInStream,
 	                                              int injectNewline,
-	                                              int alignWithPrevious,
+	                                              int align,
 	                                              int ws)
 	{
 		CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
@@ -286,12 +289,11 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
 
 		boolean prevIsWS = prevToken.getChannel()==Token.HIDDEN_CHANNEL; // assume this means whitespace
 		int actualNL = Tool.count(prevToken.getText(), '\n');
-		int actualWS = Tool.count(prevToken.getText(), ' ');
-		String newlinePredictionString = String.format("### line %d: predicted %d \\n actual %s",
+		String newlinePredictionString = String.format("### line %d: predicted %d \\n actual ?",
 		                                               originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none");
-		String alignPredictionString = String.format("### line %d: predicted %s actual %s",
+		String alignPredictionString = String.format("### line %d: predicted %d actual %s",
 		                                             originalCurToken.getLine(),
-		                                             alignWithPrevious==1?"align":"unaligned",
+		                                             align,
 		                                             "?");
 
 		String newlineAnalysis = newlinePredictionString+"\n"+
diff --git a/java/src/org/antlr/codebuff/Neighbor.java b/java/src/org/antlr/codebuff/Neighbor.java
@@ -17,7 +17,7 @@ public String toString(FeatureMetaData[] FEATURES, List<Integer> Y) {
 		int[] X = corpus.X.get(corpusVectorIndex);
 		InputDocument doc = corpus.documents.get(corpusVectorIndex);
 		String features = CollectFeatures._toString(FEATURES, doc, X);
-		int line = CollectFeatures.getInfoLine(X);
+		int line = X[CollectFeatures.INDEX_INFO_LINE];
 		String lineText = doc.getLine(line);
 		int col = X[CollectFeatures.INDEX_INFO_CHARPOS];
 		// insert a dot right before char position
diff --git a/java/src/org/antlr/codebuff/kNNClassifier.java b/java/src/org/antlr/codebuff/kNNClassifier.java
@@ -125,8 +125,8 @@ public Neighbor[] kNN(int[] unknown, int k, double distanceThreshold) {
 	}
 
 	public Neighbor[] distances(int[] unknown, double distanceThreshold) {
-		int curTokenRuleIndex = unknown[CollectFeatures.INDEX_RULE];
-		int prevTokenRuleIndex = unknown[CollectFeatures.INDEX_PREV_RULE];
+		int curTokenRuleIndex = unknown[CollectFeatures.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR];
+		int prevTokenRuleIndex = unknown[CollectFeatures.INDEX_EARLIEST_LEFT_ANCESTOR];
 		int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0];
 		int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0];
 		Pair<Integer, Integer> key =  new Pair<>(pr, cr);