Skip to content

Commit 414141f

Browse files
committed
[ #71 ] Emit warnings for nullable tokens.
The parser for .x files will now generate warnings for rules whose regular expression is nullable, i.e., matches the empty input. Exceptions are: 1. The user wrote regular expression (). In this case they probably meant to match the empty input. 2. Startcodes are involved. Then, matching the empty input might still change the lexer state, thus, the lexer might not continue to match the empty input. Warnings are kept as a stack in the state of the parse monad, and printed upon successful completion of the parser in Main. Example (excerpt of `issue_71.x`): ``` $whitespace = [\ \n\t] @whitespaces = $whitespace* :- @whitespaces { \ _ _ -> Whitespaces } ^ Warning here! Warning: issue_71.x:24:14: Regular expression [..]* matches the empty string, but will be interpreted as not matching the empty string. ``` Since the parser does not generate abstract syntax with position information, it is hard to give the exact warning location, i.e., the location of the offending regular expression. To keep the changes minimal, we record the location of the token _after_ the regular expression, i.e., the location where the code part begins. This approximate location should be good enough to jump to the regex in question. Another problem is that the exact text of the regular expression is not printed in the warning, only what `Show RExp` gives us. This could be fixed if we had the exact location information of the regular expression; we could then cut the regular expression out of the input string. Alternatively, the parser could be modified to return a _concrete_ syntax for the regular expression from which its original text could be recovered. The abstract reg.ex. would then be built from the concrete one in a second step. At this point, I abstain from such invasive changes to Alex for the sake of improving this rare warning.
1 parent d04ca8b commit 414141f

File tree

4 files changed

+101
-37
lines changed

4 files changed

+101
-37
lines changed

src/AbsSyn.hs

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ module AbsSyn (
1414
wrapperName,
1515
Scanner(..),
1616
RECtx(..),
17-
RExp(..),
17+
RExp(..), nullable,
1818
DFA(..), State(..), SNum, StartCode, Accept(..),
1919
RightContext(..), showRCtx, strtype,
2020
encodeStartCodes, extractActions,
@@ -188,21 +188,21 @@ usesPreds dfa
188188
-- Regular expressions
189189

190190
-- `RExp' provides an abstract syntax for regular expressions. `Eps' will
191-
-- match empty strings; `Ch p' matches strings containinng a single character
191+
-- match empty strings; `Ch p' matches strings containing a single character
192192
-- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of
193193
-- its prefixes and `re2' matches the rest; `re1 :| re2' matches a string if
194194
-- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be
195195
-- expressed in terms of the other operators. See the definitions of `ARexp'
196196
-- for a formal definition of the semantics of these operators.
197197

198198
data RExp
199-
= Eps
200-
| Ch CharSet
201-
| RExp :%% RExp
202-
| RExp :| RExp
203-
| Star RExp
204-
| Plus RExp
205-
| Ques RExp
199+
= Eps -- ^ Empty.
200+
| Ch CharSet -- ^ Singleton.
201+
| RExp :%% RExp -- ^ Sequence.
202+
| RExp :| RExp -- ^ Alternative.
203+
| Star RExp -- ^ Zero or more repetitions.
204+
| Plus RExp -- ^ One or more repetitions.
205+
| Ques RExp -- ^ Zero or one repetitions.
206206

207207
instance Show RExp where
208208
showsPrec _ Eps = showString "()"
@@ -213,6 +213,17 @@ instance Show RExp where
213213
showsPrec _ (Plus r) = shows r . ('+':)
214214
showsPrec _ (Ques r) = shows r . ('?':)
215215

216+
-- | A regular expression is nullable if it matches the empty string.
217+
nullable :: RExp -> Bool
218+
nullable Eps = True
219+
nullable Ch{} = False
220+
nullable (l :%% r) = nullable l && nullable r
221+
nullable (l :| r) = nullable l || nullable r
222+
nullable Star{} = True
223+
nullable (Plus r) = nullable r
224+
nullable Ques{} = True
225+
226+
216227
{------------------------------------------------------------------------------
217228
Abstract Regular Expression
218229
------------------------------------------------------------------------------}

src/Main.hs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import Info
1818
import Map ( Map )
1919
import qualified Map hiding ( Map )
2020
import Output
21-
import ParseMonad ( runP )
21+
import ParseMonad ( runP, Warning(..) )
2222
import Parser
2323
import Scan
2424
import Util ( hline )
@@ -114,11 +114,20 @@ parseScript file prg =
114114
Left (Nothing, err) ->
115115
die (file ++ ": " ++ err ++ "\n")
116116

117-
Right script@(_, _, scanner, _) -> do
117+
Right (warnings, script@(_, _, scanner, _)) -> do
118118
-- issue 46: give proper error when lexer definition is empty
119119
when (null $ scannerTokens scanner) $
120120
dieAlex $ file ++ " contains no lexer rules\n"
121+
-- issue 71: warn about nullable regular expressions
122+
mapM_ printWarning warnings
121123
return script
124+
where
125+
printWarning (WarnNullableRExp (AlexPn _ line col) msg) =
126+
hPutStrLn stderr $ concat
127+
[ "Warning: "
128+
, file , ":", show line , ":" , show col , ": "
129+
, msg
130+
]
122131

123132
alex :: [CLIFlags]
124133
-> FilePath

src/ParseMonad.hs

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
module ParseMonad (
1010
AlexInput, alexInputPrevChar, alexGetChar, alexGetByte,
1111
AlexPosn(..), alexStartPos,
12-
12+
Warning(..), warnIfNullable,
1313
P, runP, StartCode, failP, lookupSMac, lookupRMac, newSMac, newRMac,
1414
setStartCode, getStartCode, getInput, setInput,
1515
) where
@@ -23,7 +23,7 @@ import UTF8
2323
#if __GLASGOW_HASKELL__ < 710
2424
import Control.Applicative ( Applicative(..) )
2525
#endif
26-
import Control.Monad ( liftM, ap )
26+
import Control.Monad ( liftM, ap, when )
2727
import Data.Word (Word8)
2828
-- -----------------------------------------------------------------------------
2929
-- The input type
@@ -57,7 +57,7 @@ alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c
5757
-- Token positions
5858

5959
-- `Posn' records the location of a token in the input text. It has three
60-
-- fields: the address (number of chacaters preceding the token), line number
60+
-- fields: the address (number of charaters preceding the token), line number
6161
-- and column of a token within the file. `start_pos' gives the position of the
6262
-- start of the file and `eof_pos' a standard encoding for the end of file.
6363
-- `move_pos' calculates the new position after traversing a given character,
@@ -77,15 +77,22 @@ alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1)
7777
-- -----------------------------------------------------------------------------
7878
-- Alex lexing/parsing monad
7979

80+
data Warning
81+
= WarnNullableRExp
82+
{ _warnPos :: AlexPosn -- ^ The position of the code following the regex.
83+
, _warnText :: String -- ^ Warning text.
84+
}
85+
8086
type ParseError = (Maybe AlexPosn, String)
8187
type StartCode = Int
8288

83-
data PState = PState {
84-
smac_env :: Map String CharSet,
85-
rmac_env :: Map String RExp,
86-
startcode :: Int,
87-
input :: AlexInput
88-
}
89+
data PState = PState
90+
{ warnings :: [Warning] -- ^ Stack of warnings, top = last warning.
91+
, smac_env :: Map String CharSet
92+
, rmac_env :: Map String RExp
93+
, startcode :: Int
94+
, input :: AlexInput
95+
}
8996

9097
newtype P a = P { unP :: PState -> Either ParseError (PState,a) }
9198

@@ -102,15 +109,27 @@ instance Monad P where
102109
Right (env',ok) -> unP (k ok) env'
103110
return = pure
104111

105-
runP :: String -> (Map String CharSet, Map String RExp)
106-
-> P a -> Either ParseError a
107-
runP str (senv,renv) (P p)
112+
-- | Run the parser on given input.
113+
runP :: String
114+
-- ^ Input string.
115+
-> (Map String CharSet, Map String RExp)
116+
-- ^ Character set and regex definitions.
117+
-> P a
118+
-- ^ Parsing computation.
119+
-> Either ParseError ([Warning], a)
120+
-- ^ List of warnings in first-to-last order, result.
121+
runP str (senv,renv) (P p)
108122
= case p initial_state of
109123
Left err -> Left err
110-
Right (_,a) -> Right a
111-
where initial_state =
112-
PState{ smac_env=senv, rmac_env=renv,
113-
startcode = 0, input=(alexStartPos,'\n',[],str) }
124+
Right (s, a) -> Right (reverse (warnings s), a)
125+
where
126+
initial_state = PState
127+
{ warnings = []
128+
, smac_env = senv
129+
, rmac_env = renv
130+
, startcode = 0
131+
, input = (alexStartPos, '\n', [], str)
132+
}
114133

115134
failP :: String -> P a
116135
failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str)
@@ -152,3 +171,21 @@ getInput = P $ \s -> Right (s, input s)
152171

153172
setInput :: AlexInput -> P ()
154173
setInput inp = P $ \s -> Right (s{ input = inp }, ())
174+
175+
-- | Add a warning if given regular expression is nullable
176+
-- unless the user wrote the regex 'Eps'.
177+
warnIfNullable
178+
:: RExp -- ^ Regular expression.
179+
-> AlexPosn -- ^ Position associated to regular expression.
180+
-> P ()
181+
-- If the user wrote @()@, they wanted to match the empty sequence!
182+
-- Thus, skip the warning then.
183+
warnIfNullable Eps _ = return ()
184+
warnIfNullable r pos = when (nullable r) $ P $ \ s ->
185+
Right (s{ warnings = WarnNullableRExp pos w : warnings s}, ())
186+
where
187+
w = unwords
188+
[ "Regular expression"
189+
, show r
190+
, "matches the empty string."
191+
]

src/Parser.y

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -109,16 +109,23 @@ tokendefs :: { [RECtx] }
109109
| {- empty -} { [] }
110110
111111
tokendef :: { [RECtx] }
112-
: startcodes rule { [ replaceCodes $1 $2 ] }
112+
: startcodes rule { [ replaceCodes $1 (snd $2) ] }
113113
| startcodes '{' rules '}' { map (replaceCodes $1) $3 }
114-
| rule { [ $1 ] }
115-
116-
rule :: { RECtx }
117-
: context rhs { let (l,e,r) = $1 in
118-
RECtx [] l e r $2 }
114+
| rule {% do
115+
let (pos, res@(RECtx _ _ e _ _)) = $1
116+
warnIfNullable e pos
117+
return [ res ]
118+
}
119+
120+
rule :: { (AlexPosn, RECtx) }
121+
: context rhs { let
122+
(l, e, r) = $1
123+
(pos, code) = $2
124+
in (pos, RECtx [] l e r code)
125+
}
119126
120127
rules :: { [RECtx] }
121-
: rule rules { $1 : $2 }
128+
: rule rules { snd $1 : $2 }
122129
| {- empty -} { [] }
123130
124131
startcodes :: { [(String,StartCode)] }
@@ -132,9 +139,9 @@ startcode :: { String }
132139
: ZERO { "0" }
133140
| ID { $1 }
134141
135-
rhs :: { Maybe Code }
136-
: CODE { case $1 of T _ (CodeT code) -> Just code }
137-
| ';' { Nothing }
142+
rhs :: { (AlexPosn, Maybe Code) }
143+
: CODE { case $1 of T pos (CodeT code) -> (pos, Just code) }
144+
| ';' { (tokPosn $1, Nothing) }
138145
139146
context :: { Maybe CharSet, RExp, RightContext RExp }
140147
: left_ctx rexp right_ctx { (Just $1,$2,$3) }

0 commit comments

Comments
 (0)