Skip to content

Commit 84792f3

Browse files
committed
Fix Haskell bugs: numbers, nested comments, dangling span...
... case-sensitive xref. Also: - Escape all HTML special characters in HaskellXref. - Use Common.xref in HaskellSymbolTokenizer.
1 parent 2261682 commit 84792f3

File tree

6 files changed

+266
-95
lines changed

6 files changed

+266
-95
lines changed
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
22+
* Copyright (c) 2017, Chris Fraire <[email protected]>.
23+
*
24+
* Copyright (c) Simon Peyton Jones.
25+
* Copyright (c) Simon Marlow.
26+
* The authors and publisher intend this Report to belong to the entire Haskell
27+
* community, and grant permission to copy and distribute it for any purpose,
28+
* provided that it is reproduced in its entirety, including this Notice.
29+
* Modified versions of this Report may also be copied and distributed for any
30+
* purpose, provided that the modified version is clearly presented as such,
31+
* and that it does not claim to be a definition of the language Haskell 2010.
32+
*/
33+
34+
Identifier = ({varid} | {conid})
35+
/*
36+
* varid → (small {small | large | digit | ' })⟨reservedid⟩
37+
* ; N.b. the "except {reservedid}" clause is excluded from OpenGrok's varid definition
38+
*/
39+
varid = {small} ({small} | {large} | {digit} | [\'])*
40+
/*
41+
* conid → large {small | large | digit | ' }
42+
*/
43+
conid = {large} ({small} | {large} | {digit} | [\'])*
44+
/*
45+
* small → ascSmall | uniSmall | _
46+
* ascSmall → a | b | … | z
47+
* uniSmall → any Unicode lowercase letter
48+
*/
49+
small = [a-z\p{Ll}_]
50+
/*
51+
* large → ascLarge | uniLarge
52+
* ascLarge → A | B | … | Z
53+
* uniLarge → any uppercase or titlecase Unicode letter
54+
*/
55+
large = [A-Z\p{Lu}\p{Lt}]
56+
/*
57+
* digit → ascDigit | uniDigit
58+
* ascDigit → 0 | 1 | … | 9
59+
* uniDigit → any Unicode decimal digit
60+
* octit → 0 | 1 | … | 7
61+
* hexit → digit | A | … | F | a | … | f
62+
*/
63+
digit = [0-9\p{Nd}]
64+
octit = [0-7]
65+
hexit = [0-9\p{Nd}A-Fa-f]
66+
67+
Number = ({integer} | {float})
68+
/*
69+
* decimal → digit{digit}
70+
* octal → octit{octit}
71+
* hexadecimal → hexit{hexit}
72+
*/
73+
decimal = {digit}+
74+
octal = {octit}+
75+
hexadecimal = {hexit}+
76+
/*
77+
*
78+
* integer → decimal
79+
* | 0o octal | 0O octal
80+
* | 0x hexadecimal | 0X hexadecimal
81+
*/
82+
integer = ({decimal} | [0][oO]{octal} | [0][xX]{hexadecimal})
83+
/*
84+
* float → decimal . decimal [exponent]
85+
* | decimal exponent
86+
*/
87+
float = ({decimal} [\.] {decimal} {exponent}? |
88+
{decimal} {exponent})
89+
/*
90+
* exponent → (e | E) [+ | -] decimal
91+
*/
92+
exponent = [eE] [\+\-]? {decimal}
93+
94+
/*
95+
* "For example, '-->' or '|--' do not begin a comment, because both of these
96+
* are legal lexemes;"
97+
*/
98+
NotComments = ("-->" | "|--")

src/org/opensolaris/opengrok/analysis/haskell/HaskellSymbolTokenizer.lex

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@
2727
*/
2828

2929
package org.opensolaris.opengrok.analysis.haskell;
30+
31+
import java.io.IOException;
3032
import org.opensolaris.opengrok.analysis.JFlexTokenizer;
3133

3234
/**
3335
* @author Harry Pan
3436
*/
35-
3637
%%
3738
%public
3839
%class HaskellSymbolTokenizer
@@ -44,12 +45,19 @@ super(in);
4445
%int
4546
%include CommonTokenizer.lexh
4647
%char
48+
%{
49+
private int nestedComment;
4750

48-
Identifier = [a-zA-Z_] [a-zA-Z0-9_']*
49-
Number = (0[xX][0-9a-fA-F]+|0[oO][0-7]+|[0-9]+\.[0-9]+|[0-9][0-9_]*)([eE][+-]?[0-9]+)?
51+
public void reset() throws IOException {
52+
super.reset();
53+
nestedComment = 0;
54+
}
55+
%}
5056

5157
%state STRING CHAR COMMENT BCOMMENT
5258

59+
%include Common.lexh
60+
%include Haskell.lexh
5361
%%
5462

5563
<YYINITIAL> {
@@ -64,26 +72,40 @@ Number = (0[xX][0-9a-fA-F]+|0[oO][0-7]+|[0-9]+\.[0-9]+|[0-9][0-9_]*)([eE][+-]?[0
6472
\" { yybegin(STRING); }
6573
\' { yybegin(CHAR); }
6674
"--" { yybegin(COMMENT); }
67-
"{-" { yybegin(BCOMMENT); }
75+
76+
{NotComments} {}
6877
}
6978

7079
<STRING> {
80+
\\[\"\\] {}
7181
\" { yybegin(YYINITIAL); }
72-
\\\" {} // escaped double quote - don't do anything
7382
}
7483

7584
<CHAR> { // we don't need to consider the case where prime is part of an identifier since it is handled above
85+
\\[\'\\] {}
7686
\' { yybegin(YYINITIAL); }
77-
\\\' {} // escaped single quote - don't do anything
7887
}
7988

8089
<COMMENT> {
81-
\n { yybegin(YYINITIAL); }
90+
{EOL} { yybegin(YYINITIAL); }
91+
}
92+
93+
<YYINITIAL, BCOMMENT> {
94+
"{-" {
95+
if (nestedComment++ == 0) {
96+
yybegin(BCOMMENT);
97+
}
98+
}
8299
}
83100

84101
<BCOMMENT> {
85-
"-}" { yybegin(YYINITIAL); }
102+
"-}" {
103+
if (--nestedComment == 0) {
104+
yybegin(YYINITIAL);
105+
}
106+
}
86107
}
87108

88109
// fallback
110+
{WhiteSpace} |
89111
[^] {}

src/org/opensolaris/opengrok/analysis/haskell/HaskellXref.lex

Lines changed: 74 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,73 +27,124 @@
2727
*/
2828

2929
package org.opensolaris.opengrok.analysis.haskell;
30-
import org.opensolaris.opengrok.analysis.JFlexXref;
30+
31+
import org.opensolaris.opengrok.analysis.JFlexXrefSimple;
32+
import org.opensolaris.opengrok.web.HtmlConsts;
3133
import org.opensolaris.opengrok.web.Util;
3234

3335
/**
3436
* @author Harry Pan
3537
*/
36-
3738
%%
3839
%public
3940
%class HaskellXref
40-
%extends JFlexXref
41+
%extends JFlexXrefSimple
4142
%unicode
42-
%ignorecase
4343
%int
4444
%include CommonXref.lexh
4545
%{
46+
private int nestedComment;
47+
48+
@Override
49+
public void reset() {
50+
super.reset();
51+
nestedComment = 0;
52+
}
53+
4654
// TODO move this into an include file when bug #16053 is fixed
4755
@Override
4856
protected int getLineNumber() { return yyline; }
4957
@Override
5058
protected void setLineNumber(int x) { yyline = x; }
5159
%}
5260

53-
Identifier = [a-zA-Z_] [a-zA-Z0-9_']*
54-
Number = (0[xX][0-9a-fA-F]+|0[oO][0-7]+|[0-9]+\.[0-9]+|[0-9][0-9_]*)([eE][+-]?[0-9]+)?
55-
5661
%state STRING CHAR COMMENT BCOMMENT
5762

5863
%include Common.lexh
5964
%include CommonURI.lexh
6065
%include CommonPath.lexh
66+
%include Haskell.lexh
6167
%%
6268
<YYINITIAL> {
6369
{Identifier} {
6470
String id = yytext();
6571
writeSymbol(id, Consts.kwd, yyline);
6672
}
67-
{Number} { out.write("<span class=\"n\">"); out.write(yytext()); out.write("</span>"); }
68-
\" { yybegin(STRING); out.write("<span class=\"s\">\""); }
69-
\' { yybegin(CHAR); out.write("<span class=\"s\">\'"); }
70-
"--" { yybegin(COMMENT); out.write("<span class=\"c\">--"); }
71-
"{-" { yybegin(BCOMMENT); out.write("<span class=\"c\">{-"); }
73+
{Number} {
74+
disjointSpan(HtmlConsts.NUMBER_CLASS);
75+
out.write(yytext());
76+
disjointSpan(null);
77+
}
78+
\" {
79+
pushSpan(STRING, HtmlConsts.STRING_CLASS);
80+
out.write(htmlize(yytext()));
81+
}
82+
\' {
83+
pushSpan(CHAR, HtmlConsts.STRING_CLASS);
84+
out.write(htmlize(yytext()));
85+
}
86+
"--" {
87+
pushSpan(COMMENT, HtmlConsts.COMMENT_CLASS);
88+
out.write(yytext());
89+
}
90+
91+
{NotComments} { out.write(yytext()); }
7292
}
7393

7494
<STRING> {
75-
\" { yybegin(YYINITIAL); out.write("\"</span>"); }
76-
\\\\ { out.write("\\\\"); }
77-
\\\" { out.write("\\\""); }
78-
{WhspChar}*{EOL} { yybegin(YYINITIAL); out.write("</span>"); startNewLine(); }
95+
\\[\"\\] { out.write(htmlize(yytext())); }
96+
\" {
97+
out.write(htmlize(yytext()));
98+
yypop();
99+
}
100+
/*
101+
* "A string may include a 'gap'—two backslants enclosing white
102+
* characters—which is ignored. This allows one to write long strings on
103+
* more than one line by writing a backslant at the end of one line and at
104+
* the start of the next." N.b. OpenGrok does not explicitly recognize the
105+
* "gap" but since a STRING must end in a non-escaped quotation mark, just
106+
* allow STRINGs to be multi-line regardless of syntax.
107+
*/
79108
}
80109

81110
<CHAR> { // we don't need to consider the case where prime is part of an identifier since it is handled above
82-
( .\' | \\.\' ) { yybegin(YYINITIAL); out.write(yytext()); out.write("</span>"); }
83-
{WhspChar}*{EOL} { yybegin(YYINITIAL); out.write("</span>"); startNewLine(); }
111+
\\[\'\\] { out.write(htmlize(yytext())); }
112+
\' {
113+
out.write(htmlize(yytext()));
114+
yypop();
115+
}
116+
/*
117+
* N.b. though only a single char is valid Haskell syntax, OpenGrok just
118+
* waits to end CHAR at a non-escaped apostrophe regardless of count.
119+
*/
84120
}
85121

86122
<COMMENT> {
87-
{WhspChar}*{EOL} { yybegin(YYINITIAL); out.write("</span>"); startNewLine(); }
123+
{WhspChar}*{EOL} {
124+
yypop();
125+
startNewLine();
126+
}
127+
}
128+
129+
<YYINITIAL, BCOMMENT> {
130+
"{-" {
131+
if (nestedComment++ == 0) {
132+
pushSpan(BCOMMENT, HtmlConsts.COMMENT_CLASS);
133+
}
134+
out.write(yytext());
135+
}
88136
}
89137

90138
<BCOMMENT> {
91-
"-}" { yybegin(YYINITIAL); out.write("-}</span>"); }
139+
"-}" {
140+
out.write(yytext());
141+
if (--nestedComment == 0) {
142+
yypop();
143+
}
144+
}
92145
}
93146

94-
"&" { out.write( "&amp;"); }
95-
"<" { out.write( "&lt;"); }
96-
">" { out.write( "&gt;"); }
147+
[&<>\'\"] { out.write(htmlize(yytext())); }
97148
{WhspChar}*{EOL} { startNewLine(); }
98149
{WhiteSpace} { out.write(yytext()); }
99150
[!-~] { out.write(yycharat(0)); }

test/org/opensolaris/opengrok/analysis/haskell/HaskellXrefTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ public void basicTest() throws IOException {
5959
"<a class=\"l\" name=\"1\" href=\"#1\">1</a>" +
6060
"<a href=\"/source/s?defs=putStrLn\" class=\"intelliWindow-symbol\"" +
6161
" data-definition-place=\"undefined-in-file\">putStrLn</a>" +
62-
" <span class=\"s\">\"Hello, world!\"</span>\n",
62+
" <span class=\"s\">&quot;Hello, world!&quot;</span>\n",
6363
w.toString());
6464
}
6565

0 commit comments

Comments
 (0)