diff --git a/src/AbsSyn.hs b/src/AbsSyn.hs index 3eb8f6b3..7aca5849 100644 --- a/src/AbsSyn.hs +++ b/src/AbsSyn.hs @@ -14,7 +14,7 @@ module AbsSyn ( wrapperName, Scanner(..), RECtx(..), - RExp(..), + RExp(..), nullable, DFA(..), State(..), SNum, StartCode, Accept(..), RightContext(..), showRCtx, strtype, encodeStartCodes, extractActions, @@ -188,7 +188,7 @@ usesPreds dfa -- Regular expressions -- `RExp' provides an abstract syntax for regular expressions. `Eps' will --- match empty strings; `Ch p' matches strings containinng a single character +-- match empty strings; `Ch p' matches strings containing a single character -- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of -- its prefixes and `re2' matches the rest; `re1 :| re2' matches a string if -- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be @@ -196,13 +196,13 @@ usesPreds dfa -- for a formal definition of the semantics of these operators. data RExp - = Eps - | Ch CharSet - | RExp :%% RExp - | RExp :| RExp - | Star RExp - | Plus RExp - | Ques RExp + = Eps -- ^ Empty. + | Ch CharSet -- ^ Singleton. + | RExp :%% RExp -- ^ Sequence. + | RExp :| RExp -- ^ Alternative. + | Star RExp -- ^ Zero or more repetitions. + | Plus RExp -- ^ One or more repetitions. + | Ques RExp -- ^ Zero or one repetitions. instance Show RExp where showsPrec _ Eps = showString "()" @@ -213,6 +213,17 @@ instance Show RExp where showsPrec _ (Plus r) = shows r . ('+':) showsPrec _ (Ques r) = shows r . ('?':) +-- | A regular expression is nullable if it matches the empty string. +nullable :: RExp -> Bool +nullable Eps = True +nullable Ch{} = False +nullable (l :%% r) = nullable l && nullable r +nullable (l :| r) = nullable l || nullable r +nullable Star{} = True +nullable (Plus r) = nullable r +nullable Ques{} = True + + {------------------------------------------------------------------------------ Abstract Regular Expression ------------------------------------------------------------------------------} diff --git a/src/Main.hs b/src/Main.hs index 1f578733..e35c9992 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -18,7 +18,7 @@ import Info import Map ( Map ) import qualified Map hiding ( Map ) import Output -import ParseMonad ( runP ) +import ParseMonad ( runP, Warning(..) ) import Parser import Scan import Util ( hline ) @@ -114,11 +114,20 @@ parseScript file prg = Left (Nothing, err) -> die (file ++ ": " ++ err ++ "\n") - Right script@(_, _, scanner, _) -> do + Right (warnings, script@(_, _, scanner, _)) -> do -- issue 46: give proper error when lexer definition is empty when (null $ scannerTokens scanner) $ dieAlex $ file ++ " contains no lexer rules\n" + -- issue 71: warn about nullable regular expressions + mapM_ printWarning warnings return script + where + printWarning (WarnNullableRExp (AlexPn _ line col) msg) = + hPutStrLn stderr $ concat + [ "Warning: " + , file , ":", show line , ":" , show col , ": " + , msg + ] alex :: [CLIFlags] -> FilePath diff --git a/src/ParseMonad.hs b/src/ParseMonad.hs index 5f85cc6b..7272226d 100644 --- a/src/ParseMonad.hs +++ b/src/ParseMonad.hs @@ -1,5 +1,5 @@ -- ----------------------------------------------------------------------------- --- +-- -- ParseMonad.hs, part of Alex -- -- (c) Simon Marlow 2003 @@ -9,7 +9,7 @@ module ParseMonad ( AlexInput, alexInputPrevChar, alexGetChar, alexGetByte, AlexPosn(..), alexStartPos, - + Warning(..), warnIfNullable, P, runP, StartCode, failP, lookupSMac, lookupRMac, newSMac, newRMac, setStartCode, getStartCode, getInput, setInput, ) where @@ -23,7 +23,7 @@ import UTF8 #if __GLASGOW_HASKELL__ < 710 import Control.Applicative ( Applicative(..) ) #endif -import Control.Monad ( liftM, ap ) +import Control.Monad ( liftM, ap, when ) import Data.Word (Word8) -- ----------------------------------------------------------------------------- -- The input type @@ -49,7 +49,7 @@ alexGetChar (_, _ ,_ : _, _) = undefined -- hide compiler warning alexGetByte :: AlexInput -> Maybe (Byte,AlexInput) alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s)) alexGetByte (_,_,[],[]) = Nothing -alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c +alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c (b:bs) = UTF8.encode c in p' `seq` Just (b, (p', c, bs, s)) @@ -57,7 +57,7 @@ alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c -- Token positions -- `Posn' records the location of a token in the input text. It has three --- fields: the address (number of chacaters preceding the token), line number +-- fields: the address (number of charaters preceding the token), line number -- and column of a token within the file. `start_pos' gives the position of the -- start of the file and `eof_pos' a standard encoding for the end of file. -- `move_pos' calculates the new position after traversing a given character, @@ -77,15 +77,22 @@ alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1) -- ----------------------------------------------------------------------------- -- Alex lexing/parsing monad +data Warning + = WarnNullableRExp + { _warnPos :: AlexPosn -- ^ The position of the code following the regex. + , _warnText :: String -- ^ Warning text. + } + type ParseError = (Maybe AlexPosn, String) type StartCode = Int -data PState = PState { - smac_env :: Map String CharSet, - rmac_env :: Map String RExp, - startcode :: Int, - input :: AlexInput - } +data PState = PState + { warnings :: [Warning] -- ^ Stack of warnings, top = last warning. + , smac_env :: Map String CharSet + , rmac_env :: Map String RExp + , startcode :: Int + , input :: AlexInput + } newtype P a = P { unP :: PState -> Either ParseError (PState,a) } @@ -102,15 +109,27 @@ instance Monad P where Right (env',ok) -> unP (k ok) env' return = pure -runP :: String -> (Map String CharSet, Map String RExp) - -> P a -> Either ParseError a -runP str (senv,renv) (P p) +-- | Run the parser on given input. +runP :: String + -- ^ Input string. + -> (Map String CharSet, Map String RExp) + -- ^ Character set and regex definitions. + -> P a + -- ^ Parsing computation. + -> Either ParseError ([Warning], a) + -- ^ List of warnings in first-to-last order, result. +runP str (senv,renv) (P p) = case p initial_state of Left err -> Left err - Right (_,a) -> Right a - where initial_state = - PState{ smac_env=senv, rmac_env=renv, - startcode = 0, input=(alexStartPos,'\n',[],str) } + Right (s, a) -> Right (reverse (warnings s), a) + where + initial_state = PState + { warnings = [] + , smac_env = senv + , rmac_env = renv + , startcode = 0 + , input = (alexStartPos, '\n', [], str) + } failP :: String -> P a failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str) @@ -121,24 +140,24 @@ failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str) lookupSMac :: (AlexPosn,String) -> P CharSet lookupSMac (posn,smac) - = P $ \s@PState{ smac_env = senv } -> + = P $ \s@PState{ smac_env = senv } -> case Map.lookup smac senv of Just ok -> Right (s,ok) Nothing -> Left (Just posn, "unknown set macro: $" ++ smac) lookupRMac :: String -> P RExp -lookupRMac rmac - = P $ \s@PState{ rmac_env = renv } -> +lookupRMac rmac + = P $ \s@PState{ rmac_env = renv } -> case Map.lookup rmac renv of Just ok -> Right (s,ok) Nothing -> Left (Nothing, "unknown regex macro: %" ++ rmac) newSMac :: String -> CharSet -> P () -newSMac smac set +newSMac smac set = P $ \s -> Right (s{smac_env = Map.insert smac set (smac_env s)}, ()) newRMac :: String -> RExp -> P () -newRMac rmac rexp +newRMac rmac rexp = P $ \s -> Right (s{rmac_env = Map.insert rmac rexp (rmac_env s)}, ()) setStartCode :: StartCode -> P () @@ -152,3 +171,21 @@ getInput = P $ \s -> Right (s, input s) setInput :: AlexInput -> P () setInput inp = P $ \s -> Right (s{ input = inp }, ()) + +-- | Add a warning if given regular expression is nullable +-- unless the user wrote the regex 'Eps'. +warnIfNullable + :: RExp -- ^ Regular expression. + -> AlexPosn -- ^ Position associated to regular expression. + -> P () +-- If the user wrote @()@, they wanted to match the empty sequence! +-- Thus, skip the warning then. +warnIfNullable Eps _ = return () +warnIfNullable r pos = when (nullable r) $ P $ \ s -> + Right (s{ warnings = WarnNullableRExp pos w : warnings s}, ()) + where + w = unwords + [ "Regular expression" + , show r + , "matches the empty string." + ] diff --git a/src/Parser.y b/src/Parser.y index 0747cb1f..bf627fa8 100644 --- a/src/Parser.y +++ b/src/Parser.y @@ -109,16 +109,23 @@ tokendefs :: { [RECtx] } | {- empty -} { [] } tokendef :: { [RECtx] } - : startcodes rule { [ replaceCodes $1 $2 ] } + : startcodes rule { [ replaceCodes $1 (snd $2) ] } | startcodes '{' rules '}' { map (replaceCodes $1) $3 } - | rule { [ $1 ] } - -rule :: { RECtx } - : context rhs { let (l,e,r) = $1 in - RECtx [] l e r $2 } + | rule {% do + let (pos, res@(RECtx _ _ e _ _)) = $1 + warnIfNullable e pos + return [ res ] + } + +rule :: { (AlexPosn, RECtx) } + : context rhs { let + (l, e, r) = $1 + (pos, code) = $2 + in (pos, RECtx [] l e r code) + } rules :: { [RECtx] } - : rule rules { $1 : $2 } + : rule rules { snd $1 : $2 } | {- empty -} { [] } startcodes :: { [(String,StartCode)] } @@ -132,9 +139,9 @@ startcode :: { String } : ZERO { "0" } | ID { $1 } -rhs :: { Maybe Code } - : CODE { case $1 of T _ (CodeT code) -> Just code } - | ';' { Nothing } +rhs :: { (AlexPosn, Maybe Code) } + : CODE { case $1 of T pos (CodeT code) -> (pos, Just code) } + | ';' { (tokPosn $1, Nothing) } context :: { Maybe CharSet, RExp, RightContext RExp } : left_ctx rexp right_ctx { (Just $1,$2,$3) }