@@ -3,9 +3,7 @@ package net.seesharpsoft.intellij.plugins.csv;
33import com.intellij.psi.tree.IElementType ;
44import net.seesharpsoft.intellij.plugins.csv.psi.CsvTypes ;
55import com.intellij.psi.TokenType ;
6- import com.intellij.lexer.FlexLexer ;import org.intellij.grammar.livePreview.LivePreviewElementType ;
7-
8- import java.util.regex.Pattern ;
6+ import com.intellij.lexer.FlexLexer ;
97
108% %
119
@@ -17,120 +15,125 @@ import java.util.regex.Pattern;
1715%{
1816 private CsvValueSeparator myValueSeparator;
1917 private CsvEscapeCharacter myEscapeCharacter;
18+ private boolean mySupportComments;
2019
21- private static final Pattern ESCAPE_TEXT_PATTERN = Pattern . compile(" [,:;|\\ t\\ r\\ n]" );
20+ private boolean isActualValueSeparator() {
21+ return myValueSeparator. isValueSeparator(yytext(). toString());
22+ }
2223
2324 /**
24- * Provide constructor that supports a Project as parameter .
25+ * Provide a constructor that supports parameters to customize the lexer.
2526 */
26- CsvLexer(java.io. Reader in, CsvValueSeparator valueSeparator, CsvEscapeCharacter escapeCharacter) {
27+ CsvLexer(java.io. Reader in, CsvValueSeparator valueSeparator, CsvEscapeCharacter escapeCharacter, boolean supportComments ) {
2728 this (in);
2829 myValueSeparator = valueSeparator;
2930 myEscapeCharacter = escapeCharacter;
31+ mySupportComments = supportComments;
3032 }
3133%}
3234%eof{ return ;
3335%eof}
3436
35- TEXT = [^ ,:;|\t\r\n \"\\ ] +
36- ESCAPED_TEXT = [ ,:;|\t\r\n] | \"\" | \\\"
37- ESCAPE_CHAR = \\
38- QUOTE = \"
39- COMMA = [ ,:;|\t]
40- EOL =\n
4137WHITE_SPACE = [ \f] +
38+ VALUE_SEPARATOR = [ ,:;|\t]
39+ RECORD_SEPARATOR =\n
40+ ESCAPED_QUOTE = \"\" | \\\"
41+ QUOTE = \"
42+ TEXT = [^ ,:;|\t\r\n \"\\ ] +
43+ BACKSLASH = \\ +
44+ COMMENT = \# [^\n] *
4245
43- %state AFTER_TEXT
44- %state ESCAPED_TEXT
45- %state UNESCAPED_TEXT
46- %state ESCAPING
46+ %state UNQUOTED
47+ %state QUOTED
4748
4849%%
4950
50- <YYINITIAL> {QUOTE }
51+ <YYINITIAL, UNQUOTED > {TEXT }
5152{
52- yybegin(ESCAPED_TEXT );
53- return CsvTypes . QUOTE ;
53+ yybegin(UNQUOTED );
54+ return CsvTypes . TEXT ;
5455}
5556
56- <ESCAPED_TEXT > {QUOTE }
57+ <YYINITIAL, UNQUOTED > {BACKSLASH }
5758{
58- yybegin(AFTER_TEXT );
59- return CsvTypes . QUOTE ;
59+ yybegin(UNQUOTED );
60+ return CsvTypes . TEXT ;
6061}
6162
62- <YYINITIAL> {TEXT }
63+ <YYINITIAL, UNQUOTED > {VALUE_SEPARATOR }
6364{
64- yybegin(UNESCAPED_TEXT );
65+ yybegin(UNQUOTED );
66+ if (isActualValueSeparator()) {
67+ return CsvTypes . COMMA ;
68+ }
6569 return CsvTypes . TEXT ;
6670}
6771
68- <UNESCAPED_TEXT, ESCAPED_TEXT > {TEXT }
72+ <YYINITIAL, UNQUOTED > {QUOTE }
6973{
70- return CsvTypes . TEXT ;
74+ yybegin(QUOTED );
75+ return CsvTypes . QUOTE ;
7176}
7277
73- <YYINITIAL, UNESCAPED_TEXT > {ESCAPE_CHAR }
78+ <YYINITIAL, UNQUOTED > {RECORD_SEPARATOR }
7479{
75- String text = yytext(). toString();
76- if (myEscapeCharacter. getCharacter(). equals(text)) {
77- return TokenType . BAD_CHARACTER ;
80+ yybegin(YYINITIAL );
81+ return CsvTypes . CRLF ;
82+ }
83+
84+ <YYINITIAL> {COMMENT}
85+ {
86+ if (mySupportComments) {
87+ return CsvTypes . COMMENT ;
7888 }
79- yybegin(UNESCAPED_TEXT );
89+ yypushback(yylength() - 1 );
90+ yybegin(UNQUOTED );
8091 return CsvTypes . TEXT ;
8192}
8293
83- <ESCAPED_TEXT, ESCAPING> {ESCAPE_CHAR} {
84- String text = yytext(). toString();
85- if (myEscapeCharacter. getCharacter(). equals(text)) {
86- switch (yystate()) {
87- case ESCAPED_TEXT :
88- yybegin(ESCAPING );
89- break ;
90- case ESCAPING :
91- yybegin(ESCAPED_TEXT );
92- break ;
93- default :
94- throw new RuntimeException (" unhandled state: " + yystate());
94+ <QUOTED> {TEXT}
95+ {
96+ return CsvTypes . TEXT ;
97+ }
98+
99+ <QUOTED> {BACKSLASH}
100+ {
101+ if (myEscapeCharacter == CsvEscapeCharacter . BACKSLASH ) {
102+ int backslashCount = yylength();
103+ if (backslashCount > 1 && (backslashCount % 2 != 0 )) {
104+ yypushback(1 );
95105 }
96- return CsvTypes . ESCAPED_TEXT ;
97106 }
98107 return CsvTypes . TEXT ;
99108}
100109
101- <ESCAPED_TEXT > {ESCAPED_TEXT }
110+ <QUOTED > {RECORD_SEPARATOR }
102111{
103- String text = yytext(). toString();
104- if (myEscapeCharacter. isEscapedQuote(text)
105- || ESCAPE_TEXT_PATTERN . matcher(text). matches()
106- ) {
112+ return CsvTypes . ESCAPED_TEXT ;
113+ }
114+
115+ <QUOTED> {VALUE_SEPARATOR}
116+ {
117+ if (isActualValueSeparator()) {
107118 return CsvTypes . ESCAPED_TEXT ;
108119 }
109- if (! text. startsWith(CsvEscapeCharacter . QUOTE. getCharacter())) {
110- yypushback(1 );
111- return CsvTypes . TEXT ;
112- }
113-
114- return TokenType . BAD_CHARACTER ;
120+ return CsvTypes . TEXT ;
115121}
116122
117- <YYINITIAL, AFTER_TEXT, UNESCAPED_TEXT > {COMMA }
123+ <QUOTED > {ESCAPED_QUOTE }
118124{
119- if (myValueSeparator. isValueSeparator(yytext(). toString())) {
120- yybegin(YYINITIAL );
121- return CsvTypes . COMMA ;
122- }
123- if (yystate() != AFTER_TEXT ) {
124- yybegin(UNESCAPED_TEXT );
125+ String text = yytext(). toString();
126+ if (! myEscapeCharacter. isEscapedQuote(text)) {
127+ yypushback(1 );
125128 return CsvTypes . TEXT ;
126129 }
127- return TokenType . BAD_CHARACTER ;
130+ return CsvTypes . ESCAPED_TEXT ;
128131}
129132
130- <YYINITIAL, AFTER_TEXT, UNESCAPED_TEXT > {EOL }
133+ <QUOTED > {QUOTE }
131134{
132- yybegin(YYINITIAL );
133- return CsvTypes . CRLF ;
135+ yybegin(UNQUOTED );
136+ return CsvTypes . QUOTE ;
134137}
135138
136139{WHITE_SPACE}
0 commit comments