55import net .seesharpsoft .UnhandledSwitchCaseException ;
66import net .seesharpsoft .commons .util .Tokenizer ;
77import net .seesharpsoft .intellij .plugins .csv .psi .CsvTypes ;
8+ import net .seesharpsoft .intellij .plugins .csv .settings .CsvEditorSettings ;
89import org .jetbrains .annotations .NotNull ;
910import org .jetbrains .annotations .Nullable ;
1011
1819public class CsvSharpLexer extends LexerBase {
1920
2021 private final Tokenizer <TokenType > tokenizer ;
22+ private final List <Tokenizer .Token <TokenType >> initialNextStateTokens ;
2123 private final List <Tokenizer .Token <TokenType >> unquotedNextStateTokens ;
2224 private final List <Tokenizer .Token <TokenType >> quotedNextStateTokens ;
2325
@@ -29,23 +31,34 @@ public class CsvSharpLexer extends LexerBase {
2931 private IElementType currentTokenType ;
3032 private boolean failed ;
3133
34+ private static final Map <TokenType , LexerState > INITIAL_NEXT_STATES = new HashMap <>();
3235 private static final Map <TokenType , LexerState > UNQUOTED_NEXT_STATES = new HashMap <>();
3336 private static final Map <TokenType , LexerState > QUOTED_NEXT_STATES = new HashMap <>();
3437
// Populates the per-state transition tables: each map is keyed by the token type that may be
// matched in that state, and the value is the lexer state entered after that token.
// NOTE(review): this text is a diff rendering; the "NN-" line below is the removed (old) mapping.
3538 static {
// Initial state: whitespace, record separators and comments stay in Initial;
// text and value separators move to Unquoted; an opening quote moves to Quoted.
39+ INITIAL_NEXT_STATES .put (TokenType .WHITESPACE , LexerState .Initial );
40+ INITIAL_NEXT_STATES .put (TokenType .TEXT , LexerState .Unquoted );
41+ INITIAL_NEXT_STATES .put (TokenType .VALUE_SEPARATOR , LexerState .Unquoted );
42+ INITIAL_NEXT_STATES .put (TokenType .BEGIN_QUOTE , LexerState .Quoted );
43+ INITIAL_NEXT_STATES .put (TokenType .RECORD_SEPARATOR , LexerState .Initial );
44+ INITIAL_NEXT_STATES .put (TokenType .COMMENT , LexerState .Initial );
45+
// Unquoted state: COMMENT is not a key here, only COMMENT_CHARACTER, which keeps the state Unquoted.
3646 UNQUOTED_NEXT_STATES .put (TokenType .WHITESPACE , LexerState .Unquoted );
3747 UNQUOTED_NEXT_STATES .put (TokenType .TEXT , LexerState .Unquoted );
48+ UNQUOTED_NEXT_STATES .put (TokenType .COMMENT_CHARACTER , LexerState .Unquoted );
3849 UNQUOTED_NEXT_STATES .put (TokenType .VALUE_SEPARATOR , LexerState .Unquoted );
// Old mapping (removed in the diff): a record separator used to keep the lexer in Unquoted.
39- UNQUOTED_NEXT_STATES .put (TokenType .RECORD_SEPARATOR , LexerState .Unquoted );
4050 UNQUOTED_NEXT_STATES .put (TokenType .BEGIN_QUOTE , LexerState .Quoted );
// New mapping: a record separator returns the lexer to Initial (same key as the removed line above,
// so this put is the one that takes effect if both are present).
51+ UNQUOTED_NEXT_STATES .put (TokenType .RECORD_SEPARATOR , LexerState .Initial );
4152
// Quoted state: everything stays Quoted except the closing quote, which returns to Unquoted.
// Separators are not keys here; COMMENT_CHARACTER keeps the state Quoted.
4253 QUOTED_NEXT_STATES .put (TokenType .WHITESPACE , LexerState .Quoted );
4354 QUOTED_NEXT_STATES .put (TokenType .TEXT , LexerState .Quoted );
55+ QUOTED_NEXT_STATES .put (TokenType .COMMENT_CHARACTER , LexerState .Quoted );
4456 QUOTED_NEXT_STATES .put (TokenType .ESCAPED_CHARACTER , LexerState .Quoted );
4557 QUOTED_NEXT_STATES .put (TokenType .END_QUOTE , LexerState .Unquoted );
4658 }
4759
4860 enum LexerState {
61+ Initial (INITIAL_NEXT_STATES ),
4962 Unquoted (UNQUOTED_NEXT_STATES ),
5063 Quoted (QUOTED_NEXT_STATES );
5164
@@ -71,22 +84,26 @@ enum TokenType {
7184 ESCAPED_CHARACTER ,
7285 VALUE_SEPARATOR ,
7386 RECORD_SEPARATOR ,
74- WHITESPACE
87+ WHITESPACE ,
88+ COMMENT ,
89+ COMMENT_CHARACTER
7590 }
7691
7792 public static class Configuration {
78- public static final Configuration DEFAULT = new Configuration ("," , "\n " , "\" " , "\" " );
93+ public static final Configuration DEFAULT = new Configuration ("," , "\n " , "\" " , "\" " , "#" );
7994
8095 public String valueSeparator ;
8196 public String recordSeparator ;
8297 public String escapeCharacter ;
8398 public String quoteCharacter ;
99+ public String commentCharacter ;
84100
85- public Configuration (String valueSeparator , String recordSeparator , String escapeCharacter , String quoteCharacter ) {
101+ public Configuration (String valueSeparator , String recordSeparator , String escapeCharacter , String quoteCharacter , String commentCharacter ) {
86102 this .valueSeparator = Pattern .quote (valueSeparator );
87103 this .recordSeparator = Pattern .quote (recordSeparator );
88104 this .escapeCharacter = Pattern .quote (escapeCharacter );
89105 this .quoteCharacter = Pattern .quote (quoteCharacter );
106+ this .commentCharacter = Pattern .quote (commentCharacter );
90107 }
91108 }
92109
@@ -102,17 +119,32 @@ public CsvSharpLexer(Configuration configuration) {
102119 tokenizer .add (TokenType .BEGIN_QUOTE , String .format ("%s" , configuration .quoteCharacter ));
103120 tokenizer .add (TokenType .VALUE_SEPARATOR , configuration .valueSeparator );
104121 tokenizer .add (TokenType .RECORD_SEPARATOR , configuration .recordSeparator );
122+ if (!configuration .commentCharacter .isEmpty ()) {
123+ tokenizer .add (TokenType .COMMENT_CHARACTER , configuration .commentCharacter );
124+ tokenizer .add (TokenType .COMMENT , configuration .commentCharacter + ".*(?=(\n |$))" );
125+ }
105126
106127 if (configuration .escapeCharacter .equals (configuration .quoteCharacter )) {
107128 tokenizer .add (TokenType .END_QUOTE , String .format ("%s(?!%s)" , configuration .quoteCharacter , configuration .quoteCharacter ));
108129 tokenizer .add (TokenType .ESCAPED_CHARACTER , String .format ("(%s%s|%s|%s)+" , configuration .quoteCharacter , configuration .quoteCharacter , configuration .valueSeparator , configuration .recordSeparator ));
109- tokenizer .add (TokenType .TEXT , String .format ("((?!%s)[^ \f %s%s])+" , configuration .valueSeparator , configuration .quoteCharacter , configuration .recordSeparator ));
130+ if (!configuration .commentCharacter .isEmpty ()) {
131+ tokenizer .add (TokenType .TEXT , String .format ("((?!(%s|%s))[^ \f %s%s])+" , configuration .commentCharacter , configuration .valueSeparator , configuration .quoteCharacter , configuration .recordSeparator ));
132+ } else {
133+ tokenizer .add (TokenType .TEXT , String .format ("((?!%s)[^ \f %s%s])+" , configuration .valueSeparator , configuration .quoteCharacter , configuration .recordSeparator ));
134+ }
110135 } else {
111136 tokenizer .add (TokenType .END_QUOTE , String .format ("%s" , configuration .quoteCharacter ));
112137 tokenizer .add (TokenType .ESCAPED_CHARACTER , String .format ("(%s%s|%s%s|%s|%s)+" , configuration .escapeCharacter , configuration .quoteCharacter , configuration .escapeCharacter , configuration .escapeCharacter , configuration .valueSeparator , configuration .recordSeparator ));
113- tokenizer .add (TokenType .TEXT , String .format ("((?!%s)[^ \f %s%s%s])+" , configuration .valueSeparator , configuration .escapeCharacter , configuration .quoteCharacter , configuration .recordSeparator ));
138+ if (!configuration .commentCharacter .isEmpty ()) {
139+ tokenizer .add (TokenType .TEXT , String .format ("((?!(%s|%s))[^ \f %s%s%s])+" , configuration .commentCharacter , configuration .valueSeparator , configuration .escapeCharacter , configuration .quoteCharacter , configuration .recordSeparator ));
140+ } else {
141+ tokenizer .add (TokenType .TEXT , String .format ("((?!%s)[^ \f %s%s%s])+" , configuration .valueSeparator , configuration .escapeCharacter , configuration .quoteCharacter , configuration .recordSeparator ));
142+ }
114143 }
115144
145+ initialNextStateTokens = LexerState .Initial .getPossibleTokens ().stream ()
146+ .map (tokenizer ::getToken )
147+ .collect (Collectors .toList ());
116148 unquotedNextStateTokens = LexerState .Unquoted .getPossibleTokens ().stream ()
117149 .map (tokenizer ::getToken )
118150 .collect (Collectors .toList ());
@@ -126,14 +158,14 @@ public void start(@NotNull CharSequence buffer, int startOffset, int endOffset,
126158 this .buffer = buffer ;
127159 this .tokenStart = this .tokenEnd = startOffset ;
128160 this .bufferEnd = endOffset ;
129- this .currentState = initialState == 0 ? LexerState .Unquoted : LexerState . Quoted ;
161+ this .currentState = LexerState .values ()[ initialState ] ;
130162 this .currentTokenType = null ;
131163 }
132164
/**
 * Returns the current lexer state as an int, after making sure the current token has been located.
 *
 * The added ("+") line returns the ordinal of {@code currentState}; {@code start(...)} in this file
 * maps the int back with {@code LexerState.values()[initialState]}, so the numeric value is tied to
 * the declaration order of {@code LexerState}. The removed ("-") line is the previous two-state
 * encoding (Unquoted = 0, otherwise 1).
 */
133165 @ Override
134166 public int getState () {
// Locate the pending token first; presumably this can advance currentState — confirm in locateToken().
135167 locateToken ();
136- return currentState == LexerState . Unquoted ? 0 : 1 ;
168+ return currentState . ordinal () ;
137169 }
138170
139171 @ Nullable
@@ -178,6 +210,19 @@ protected void raiseFailure() {
178210 tokenEnd = bufferEnd ;
179211 }
180212
213+ protected Collection <Tokenizer .Token <TokenType >> getCurrentTokenCollection () {
214+ switch (this .currentState ) {
215+ case Initial :
216+ return initialNextStateTokens ;
217+ case Unquoted :
218+ return unquotedNextStateTokens ;
219+ case Quoted :
220+ return quotedNextStateTokens ;
221+ default :
222+ throw new UnhandledSwitchCaseException (this .currentState );
223+ }
224+ }
225+
181226 protected synchronized void locateToken () {
182227 if (currentTokenType != null ) {
183228 return ;
@@ -193,7 +238,7 @@ protected synchronized void locateToken() {
193238 tokenizer .findToken (buffer ,
194239 tokenStart ,
195240 bufferEnd ,
196- currentState == LexerState . Unquoted ? unquotedNextStateTokens : quotedNextStateTokens ,
241+ getCurrentTokenCollection () ,
197242 null
198243 );
199244
@@ -222,8 +267,12 @@ protected synchronized void locateToken() {
222267 currentTokenType = CsvTypes .COMMA ;
223268 break ;
224269 case TEXT :
270+ case COMMENT_CHARACTER :
225271 currentTokenType = CsvTypes .TEXT ;
226272 break ;
273+ case COMMENT :
274+ currentTokenType = CsvTypes .COMMENT ;
275+ break ;
227276 case WHITESPACE :
228277 currentTokenType = com .intellij .psi .TokenType .WHITE_SPACE ;
229278 break ;
0 commit comments