Skip to content

Commit b0a647d

Browse files
committed
Merge pull request #21 from antlr/features-reboot
Features reboot
2 parents a3e0d11 + dc56ce6 commit b0a647d

File tree

6 files changed

+184
-237
lines changed

6 files changed

+184
-237
lines changed

java/src/org/antlr/codebuff/CollectFeatures.java

Lines changed: 148 additions & 203 deletions
Large diffs are not rendered by default.

java/src/org/antlr/codebuff/Corpus.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ public void randomShuffleInPlace() {
7777
public void buildTokenContextIndex() {
7878
curAndPrevTokenRuleIndexToVectorsMap = new HashMap<>();
7979
for (int i=0; i<X.size(); i++) {
80-
int curTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_RULE];
81-
int prevTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_PREV_RULE];
80+
int curTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR];
81+
int prevTokenRuleIndex = X.get(i)[CollectFeatures.INDEX_EARLIEST_LEFT_ANCESTOR];
8282
int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0];
8383
int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0];
8484
Pair<Integer, Integer> key = new Pair<>(pr, cr);

java/src/org/antlr/codebuff/FeatureType.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package org.antlr.codebuff;
22

33
public enum FeatureType {
4-
TOKEN(12), RULE(14), INT(7), BOOL(5), COL(7),
4+
TOKEN(12), RULE(14), INT(12), BOOL(5), COL(7),
55
INFO_FILE(15), INFO_LINE(4), INFO_CHARPOS(4),
66
UNUSED(0);
77
public int displayWidth;

java/src/org/antlr/codebuff/Formatter.java

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919
import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN;
2020
import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL;
2121
import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS;
22+
import static org.antlr.codebuff.CollectFeatures.CAT_NO_ALIGNMENT;
2223
import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
2324
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
2425
import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
25-
import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
26+
import static org.antlr.codebuff.CollectFeatures.INDEX_MATCHING_TOKEN_DIFF_LINE;
2627
import static org.antlr.codebuff.CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD;
2728
import static org.antlr.codebuff.CollectFeatures.earliestAncestorStartingWithToken;
29+
import static org.antlr.codebuff.CollectFeatures.getMatchingSymbolOnDiffLine;
2830
import static org.antlr.codebuff.CollectFeatures.getNodeFeatures;
2931
import static org.antlr.codebuff.CollectFeatures.getRealTokens;
3032
import static org.antlr.codebuff.CollectFeatures.getTokensOnPreviousLine;
@@ -46,7 +48,6 @@ public class Formatter {
4648
protected Vector<TokenPositionAnalysis> analysis = new Vector<>();
4749

4850
protected CodekNNClassifier nlwsClassifier;
49-
protected CodekNNClassifier wsClassifier;
5051
protected CodekNNClassifier alignClassifier;
5152
protected int k;
5253

@@ -112,13 +113,14 @@ public String format() {
112113
public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
113114
CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
114115
String tokText = curToken.getText();
116+
TerminalNode node = tokenToNodeMap.get(curToken);
115117

116118
emitCommentsToTheLeft(tokenIndexInStream);
117119

118120
int[] features = getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize);
119121
// must set "prev end column" value as token stream doesn't have it;
120122
// we're tracking it as we emit tokens
121-
features[INDEX_PREV_END_COLUMN] = charPosInLine;
123+
// features[INDEX_PREV_END_COLUMN] = charPosInLine;
122124

123125
int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD);
124126
int newlines = 0;
@@ -130,23 +132,15 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
130132
ws = CollectFeatures.unwscat(injectNL_WS);
131133
}
132134

133-
// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
134-
features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment
135-
136-
int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
137-
138-
TokenPositionAnalysis tokenPositionAnalysis =
139-
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
140-
analysis.setSize(tokenIndexInStream+1);
141-
analysis.set(tokenIndexInStream, tokenPositionAnalysis);
142-
143135
if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
144136
ws = 1;
145137
}
146138

139+
int align = CAT_NO_ALIGNMENT;
140+
147141
if ( newlines>0 ) {
148142
output.append(Tool.newlines(newlines));
149-
line++;
143+
line+=newlines;
150144
charPosInLine = 0;
151145

152146
List<Token> tokensOnPreviousLine = getTokensOnPreviousLine(tokens, tokenIndexInStream, line);
@@ -155,9 +149,15 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
155149
firstTokenOnPrevLine = tokensOnPreviousLine.get(0);
156150
}
157151

158-
TerminalNode node = tokenToNodeMap.get(curToken);
159152
ParserRuleContext parent = (ParserRuleContext)node.getParent();
160153

154+
// getNodeFeatures() doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
155+
features[INDEX_FIRST_ON_LINE] = newlines>0 ? 1 : 0; // use \n prediction to match exemplars for alignment
156+
// if we decide to inject a newline, we better recompute this value before classifying alignment
157+
features[INDEX_MATCHING_TOKEN_DIFF_LINE] = getMatchingSymbolOnDiffLine(doc, node, line);
158+
159+
align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
160+
161161
if ( align==CAT_INDENT ) {
162162
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we cannot indent
163163
int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL;
@@ -169,10 +169,7 @@ else if ( (align&0xFF)==CAT_ALIGN_WITH_ANCESTOR_CHILD ) {
169169
int[] deltaChild = CollectFeatures.unaligncat(align);
170170
int deltaFromAncestor = deltaChild[0];
171171
int childIndex = deltaChild[1];
172-
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken);
173-
if ( earliestLeftAncestor==null ) {
174-
earliestLeftAncestor = parent;
175-
}
172+
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
176173
ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor);
177174
ParseTree child = ancestor.getChild(childIndex);
178175
Token start = null;
@@ -194,10 +191,7 @@ else if ( child instanceof TerminalNode ){
194191
}
195192
else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
196193
int deltaFromAncestor = CollectFeatures.unindentcat(align);
197-
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken);
198-
if ( earliestLeftAncestor==null ) {
199-
earliestLeftAncestor = parent;
200-
}
194+
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
201195
ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor);
202196
Token start = ancestor.getStart();
203197
int indentCol = start.getCharPositionInLine() + INDENT_LEVEL;
@@ -211,6 +205,11 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
211205
charPosInLine += ws;
212206
}
213207

208+
TokenPositionAnalysis tokenPositionAnalysis =
209+
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
210+
analysis.setSize(tokenIndexInStream+1);
211+
analysis.set(tokenIndexInStream, tokenPositionAnalysis);
212+
214213
// update Token object with position information now that we are about
215214
// to emit it.
216215
curToken.setLine(line);
@@ -227,6 +226,10 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
227226
/** Look into the token stream to get the comments to the left of current
228227
* token. Emit all whitespace and comments except for whitespace at the
229228
* end as we'll inject that per newline prediction.
229+
*
230+
* This assumes we are grooming, not totally reformatting.
231+
* We are able to see the original input stream for comment purposes. With all
232+
* whitespace removed, we can't emit this stuff properly at the moment.
230233
*/
231234
public void emitCommentsToTheLeft(int tokenIndexInStream) {
232235
List<Token> hiddenTokensToLeft = tokens.getHiddenTokensToLeft(tokenIndexInStream);
@@ -270,7 +273,7 @@ public void emitCommentsToTheLeft(int tokenIndexInStream) {
270273

271274
public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealTokens, int tokenIndexInStream,
272275
int injectNewline,
273-
int alignWithPrevious,
276+
int align,
274277
int ws)
275278
{
276279
CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
@@ -286,12 +289,11 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
286289

287290
boolean prevIsWS = prevToken.getChannel()==Token.HIDDEN_CHANNEL; // assume this means whitespace
288291
int actualNL = Tool.count(prevToken.getText(), '\n');
289-
int actualWS = Tool.count(prevToken.getText(), ' ');
290-
String newlinePredictionString = String.format("### line %d: predicted %d \\n actual %s",
292+
String newlinePredictionString = String.format("### line %d: predicted %d \\n actual ?",
291293
originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none");
292-
String alignPredictionString = String.format("### line %d: predicted %s actual %s",
294+
String alignPredictionString = String.format("### line %d: predicted %d actual %s",
293295
originalCurToken.getLine(),
294-
alignWithPrevious==1?"align":"unaligned",
296+
align,
295297
"?");
296298

297299
String newlineAnalysis = newlinePredictionString+"\n"+

java/src/org/antlr/codebuff/Neighbor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public String toString(FeatureMetaData[] FEATURES, List<Integer> Y) {
1717
int[] X = corpus.X.get(corpusVectorIndex);
1818
InputDocument doc = corpus.documents.get(corpusVectorIndex);
1919
String features = CollectFeatures._toString(FEATURES, doc, X);
20-
int line = CollectFeatures.getInfoLine(X);
20+
int line = X[CollectFeatures.INDEX_INFO_LINE];
2121
String lineText = doc.getLine(line);
2222
int col = X[CollectFeatures.INDEX_INFO_CHARPOS];
2323
// insert a dot right before char position

java/src/org/antlr/codebuff/kNNClassifier.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@ public Neighbor[] kNN(int[] unknown, int k, double distanceThreshold) {
125125
}
126126

127127
public Neighbor[] distances(int[] unknown, double distanceThreshold) {
128-
int curTokenRuleIndex = unknown[CollectFeatures.INDEX_RULE];
129-
int prevTokenRuleIndex = unknown[CollectFeatures.INDEX_PREV_RULE];
128+
int curTokenRuleIndex = unknown[CollectFeatures.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR];
129+
int prevTokenRuleIndex = unknown[CollectFeatures.INDEX_EARLIEST_LEFT_ANCESTOR];
130130
int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0];
131131
int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0];
132132
Pair<Integer, Integer> key = new Pair<>(pr, cr);

0 commit comments

Comments
 (0)