diff --git a/.gitignore b/.gitignore index 594bbd9..151c619 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ /examples/*.happy.hs /examples/*.bin /examples/*.exe +/examples/wyvern/*.alex.hs /old-*/ /tests/*.[dign].hs /tests/*.[dign].bin diff --git a/examples/Makefile b/examples/Makefile index 6e844b7..39fec6b 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,4 +1,11 @@ +# NOTE: This logic follows the tests/Makefile for consistency. +ifndef ALEX +ALEX=$(shell which alex) +ifeq "$(filter $(dir $(shell pwd))%,$(ALEX))" "" ALEX=../dist/build/alex/alex +endif +endif + HC=ghc -Wall -fno-warn-unused-binds -fno-warn-missing-signatures -fno-warn-unused-matches -fno-warn-name-shadowing -fno-warn-unused-imports -fno-warn-tabs HAPPY=happy @@ -10,7 +17,7 @@ else exeext=.bin endif -PROGS = lit Tokens Tokens_gscan words words_posn words_monad tiny haskell tiger +PROGS = lit Tokens Tokens_gscan words words_posn words_monad tiny haskell tiger WyvernLexerV1 WyvernLexerV2 WyvernLexerV3 ALEX_OPTS = --template=../data/ -g @@ -54,6 +61,15 @@ haskell$(exeext) : haskell.alex.hs tiger$(exeext) : tiger.alex.hs $(HC) $(HC_OPTS) -main-is TigerLexer -o $@ $^ +WyvernLexerV1$(exeext) : wyvern/WyvernLexerV1.alex.hs + $(HC) $(HC_OPTS) -main-is WyvernLexerV1 -o $@ $^ + +WyvernLexerV2$(exeext) : wyvern/WyvernLexerV2.alex.hs + $(HC) $(HC_OPTS) -main-is WyvernLexerV2 -o $@ $^ + +WyvernLexerV3$(exeext) : wyvern/WyvernLexerV3.alex.hs + $(HC) $(HC_OPTS) -main-is WyvernLexerV3 -o $@ $^ + .PHONY: clean clean: rm -f *.o *.hi $(addsuffix $(exeext),$(PROGS)) \ diff --git a/examples/haskell.x b/examples/haskell.x index d4e899d..a6bafbd 100644 --- a/examples/haskell.x +++ b/examples/haskell.x @@ -11,7 +11,7 @@ { module Main (main) where -import Data.Char (chr) +import Data.Char (chr, ord) } %wrapper "monad" @@ -39,7 +39,7 @@ $idchar = [$alpha $digit \'] $symchar = [$symbol \:] $nl = [\n\r] -@reservedid = +@reservedid = as|case|class|data|default|deriving|do|else|hiding|if| 
import|in|infix|infixl|infixr|instance|let|module|newtype| of|qualified|then|type|where @@ -88,7 +88,7 @@ haskell :- <0> @varsym { mkL LVarSym } <0> @consym { mkL LConSym } -<0> @decimal +<0> @decimal | 0[oO] @octal | 0[xX] @hexadecimal { mkL LInteger } @@ -121,7 +121,7 @@ data LexemeClass | LQConSym | LEOF deriving Eq - + mkL :: LexemeClass -> AlexInput -> Int -> Alex Lexeme mkL c (p,_,_,str) len = return (L p c (take len str)) @@ -149,17 +149,17 @@ nested_comment _ _ = do Just (c,input) -> go n input c -> go n input - err input = do alexSetInput input; lexError "error in nested comment" + err input = do alexSetInput input; lexError "error in nested comment" lexError s = do (p,c,_,input) <- alexGetInput - alexError (showPosn p ++ ": " ++ s ++ + alexError (showPosn p ++ ": " ++ s ++ (if (not (null input)) then " before " ++ show (head input) else " at end of file")) scanner str = runAlex str $ do - let loop i = do tok@(L _ cl _) <- alexMonadScan; + let loop i = do tok@(L _ cl _) <- alexMonadScan; if cl == LEOF then return i else do loop $! (i+1) diff --git a/examples/tiger.x b/examples/tiger.x index c843f5a..e9ca44a 100644 --- a/examples/tiger.x +++ b/examples/tiger.x @@ -21,7 +21,7 @@ import System.Directory ( doesFileExist ) import Control.Monad import Data.Maybe import Numeric ( readDec ) -import Data.Char ( chr ) +import Data.Char ( chr, ord ) import Data.Map ( Map ) import qualified Data.Map as Map ( empty ) } diff --git a/examples/wyvern/WyvernLexerV1.x b/examples/wyvern/WyvernLexerV1.x new file mode 100644 index 0000000..1cf3526 --- /dev/null +++ b/examples/wyvern/WyvernLexerV1.x @@ -0,0 +1,49 @@ +{ +module WyvernLexerV1 + (main, + alexScanTokens, + Token(TokenAction, + TokenSoloIdentifier, + TokenOCB, + TokenCCB)) where +} + +%wrapper "basic" + +$digit = 0-9 +$alpha = [a-zA-Z] + +$idChar = [$alpha $digit \'] +$contentChar = [$alpha $digit $white \' \, \! \- \. \/ \? 
\= \< \> \[ \] \+ \( \)] + +@id = $idChar+ +@content = $contentChar+ + +tokens :- + + $white+ ; + @id [$white]+ \"@content\" { \s -> TokenAction s } + \"@content\" { \s -> TokenAction ("# " <> s) -- # is a placeholder id that will later be replaced by a unique identifier } + @id { \s -> TokenSoloIdentifier s } + \{ { \_ -> TokenOCB } + \} { \_ -> TokenCCB } + +{ +data Token + = TokenAction String + | TokenSoloIdentifier String + | TokenOCB + | TokenCCB + deriving Show + +main = do + putStrLn "Wyvern lexer v1 correct example: " + correctFileContent <- readFile "./wyvern/correct-input.txt" + let correctTokens = alexScanTokens correctFileContent + print correctTokens + + putStrLn "Wyvern lexer v1 incorrect example: " + incorrectFileContent <- readFile "./wyvern/incorrect-input.txt" + let incorrectTokens = alexScanTokens incorrectFileContent + print incorrectTokens +} diff --git a/examples/wyvern/WyvernLexerV2.x b/examples/wyvern/WyvernLexerV2.x new file mode 100644 index 0000000..b5047ef --- /dev/null +++ b/examples/wyvern/WyvernLexerV2.x @@ -0,0 +1,52 @@ +{ +module WyvernLexerV2 + (main, + alexScanTokens, + Token(TokenAction, + TokenSoloIdentifier, + TokenOCB, + TokenCCB), + AlexPosn(..)) where +} + +%wrapper "posn" + +$digit = 0-9 +$alpha = [a-zA-Z] + +$idChar = [$alpha $digit \'] +$contentChar = [$alpha $digit $white \' \, \! \- \. \/ \? 
\= \< \> \[ \] \+ \( \)] + +@id = $idChar+ +@content = $contentChar+ + +tokens :- + + $white+ ; + @id [$white]+ \"@content\" { (\position input -> TokenAction position input) } + \"@content\" { (\position input -> TokenAction position ("# " <> input)) -- # is a placeholder id that will later be replaced by a unique identifier } + @id { (\position input -> TokenSoloIdentifier position input) } + \{ { (\position _ -> TokenOCB position) } + \} { (\position _ -> TokenCCB position) } + +{ +-- Each token action (the right hand side function) is of type :: AlexPosn -> String -> Token + +data Token + = TokenAction AlexPosn String + | TokenSoloIdentifier AlexPosn String + | TokenOCB AlexPosn + | TokenCCB AlexPosn + deriving (Eq, Show) + +main = do + putStrLn "Wyvern lexer v2 correct example: " + correctFileContent <- readFile "./wyvern/correct-input.txt" + let correctTokens = alexScanTokens correctFileContent + print correctTokens + + putStrLn "Wyvern lexer v2 incorrect example: " + incorrectFileContent <- readFile "./wyvern/incorrect-input.txt" + let incorrectTokens = alexScanTokens incorrectFileContent + print incorrectTokens +} diff --git a/examples/wyvern/WyvernLexerV3.x b/examples/wyvern/WyvernLexerV3.x new file mode 100644 index 0000000..d59e6bf --- /dev/null +++ b/examples/wyvern/WyvernLexerV3.x @@ -0,0 +1,66 @@ +{ +module WyvernLexerV3 + (main, + lexAll, + runAlex, + Token(TokenAction, + TokenSoloIdentifier, + TokenOCB, + TokenCCB), + AlexPosn(..)) where +} + +%wrapper "monad" + +$digit = 0-9 +$alpha = [a-zA-Z] + +$idChar = [$alpha $digit \'] +$contentChar = [$alpha $digit $white \' \, \! \- \. \/ \? 
\= \< \> \[ \] \+ \( \)] + +@id = $idChar+ +@content = $contentChar+ + +tokens :- + + $white+ ; + @id [$white]+ \"@content\" { (\(position, _previousCharacter, _bytes, inputString) len -> return $ TokenAction position (take len inputString)) } + \"@content\" { (\(position, _previousCharacter, _bytes, inputString) len -> return $ TokenAction position ("# " <> take len inputString)) -- # is a placeholder id that will later be replaced by a unique identifier } + @id { (\(position, _previousCharacter, _bytes, inputString) len -> return $ TokenSoloIdentifier position (take len inputString)) } + \{ { (\(position, _previousCharacter, _bytes, _inputString) len -> return $ TokenOCB position) } + \} { (\(position, _previousCharacter, _bytes, _inputString) len -> return $ TokenCCB position) } + +{ +-- Each token action (the right hand side function) is of type :: AlexInput -> Int -> Alex Token + +data Token + = TokenAction AlexPosn String + | TokenSoloIdentifier AlexPosn String + | TokenOCB AlexPosn + | TokenCCB AlexPosn + | TokenEOF + deriving (Eq, Show) + +alexEOF :: Alex Token +alexEOF = return TokenEOF + +lexAll :: Alex [Token] +lexAll = go + where + go = do + t <- alexMonadScan + case t of + TokenEOF -> return [] + _ -> (t:) <$> go + +main = do + putStrLn "Wyvern lexer v3 correct example: " + correctFileContent <- readFile "./wyvern/correct-input.txt" + let correctTokens = runAlex correctFileContent lexAll + print correctTokens + + putStrLn "Wyvern lexer v3 incorrect example: " + incorrectFileContent <- readFile "./wyvern/incorrect-input.txt" + let incorrectTokens = runAlex incorrectFileContent lexAll + print incorrectTokens +} diff --git a/examples/wyvern/correct-input.txt b/examples/wyvern/correct-input.txt new file mode 100644 index 0000000..ff33a7b --- /dev/null +++ b/examples/wyvern/correct-input.txt @@ -0,0 +1,9 @@ +"action" +"question" +{ + "action" +} +{ + "action" +} +"action" diff --git a/examples/wyvern/incorrect-input.txt b/examples/wyvern/incorrect-input.txt 
new file mode 100644 index 0000000..f1512d5 --- /dev/null +++ b/examples/wyvern/incorrect-input.txt @@ -0,0 +1,9 @@ +"action" +"question" +{ + "action"| +} +{ + "action" +} +"action" diff --git a/examples/wyvern/readme.md b/examples/wyvern/readme.md new file mode 100644 index 0000000..c261827 --- /dev/null +++ b/examples/wyvern/readme.md @@ -0,0 +1,141 @@ +# wyvern - sample lexers + +Below are three sample alex files illustrating how to use various alex wrappers and the differences between them: + +* `basic` +* `posn` +* `monad` + +All examples are taken from [wyvern-diagrams](https://hackage.haskell.org/package/wyvern-diagrams). + +All lexers return a list of tokens (plus some additional information, such as token positions, in examples 2 and 3), so if the provided input is correct, the tokens can be used directly in parsers. + +## input + +### correct + +[source](./correct-input.txt) + +``` +"action" +"question" +{ + "action" +} +{ + "action" +} +"action" +``` + +### incorrect + +[source](./incorrect-input.txt) + +``` +"action" +"question" +{ + "action"| +} +{ + "action" +} +"action" +``` + +## `basic` - lexer v1 + +[source](./WyvernLexerV1.x) + +### features + +* easy to implement +* extracts tokens as advertised +* very useful for your first lexer +* no information about token/error positions +* errors out on incorrect input + +### usage + +While in the `examples` directory: + +```bash +make +./WyvernLexerV1.[bin/exe] +``` + +### output + +```bash +Wyvern lexer v1 correct example: +[TokenAction "# \"action\"",TokenAction "# \"question\"",TokenOCB,TokenAction "# \"action\"",TokenCCB,TokenOCB,TokenAction "# \"action\"",TokenCCB,TokenAction "# \"action\""] +Wyvern lexer v1 incorrect example: +WyvernLexerV1.bin: lexical error +CallStack (from HasCallStack): + error, called at wyvern/WyvernLexerV1.alex.hs:494:32 in main:WyvernLexerV1 +``` + +## `posn` - lexer v2 + +[source](./WyvernLexerV2.x) + +### features + +* only slightly more difficult to implement compared to the 
basic wrapper +* extracts tokens as advertised but also reveals more information about: + * token positions + * error positions +* errors out on incorrect input + +### usage + +While in the `examples` directory: + +```bash +make +./WyvernLexerV2.[bin/exe] +``` + +### output + +```bash +Wyvern lexer v2 correct example: +[TokenAction (AlexPn 0 1 1) "# \"action\"",TokenAction (AlexPn 9 2 1) "# \"question\"",TokenOCB (AlexPn 20 3 1),TokenAction (AlexPn 26 4 5) "# \"action\"",TokenCCB (AlexPn 35 5 1),TokenOCB (AlexPn 37 6 1),TokenAction (AlexPn 43 7 5) "# \"action\"",TokenCCB (AlexPn 52 8 1),TokenAction (AlexPn 54 9 1) "# \"action\""] +Wyvern lexer v2 incorrect example: +WyvernLexerV2.bin: lexical error at line 4, column 13 +CallStack (from HasCallStack): + error, called at wyvern/WyvernLexerV2.alex.hs:574:61 in main:WyvernLexerV2 +``` + +## `monad` - lexer v3 + +[source](./WyvernLexerV3.x) + +### features + +* only slightly more difficult to implement compared to the posn wrapper but has a different interface (`alexMonadScan`, `runAlex`) +* just like the posn wrapper, it extracts tokens as advertised but also reveals more information about: + * token positions + * error positions +* does not error out on incorrect input but instead returns the result as `Either`: + * string error + * list of tokens + +### usage + +While in the `examples` directory: + +```bash +make +./WyvernLexerV3.[bin/exe] +``` + +### output + +```bash +Wyvern lexer v3 correct example: +Right [TokenAction (AlexPn 0 1 1) "# \"action\"",TokenAction (AlexPn 9 2 1) "# \"question\"",TokenOCB (AlexPn 20 3 1),TokenAction (AlexPn 26 4 5) "# \"action\"",TokenCCB (AlexPn 35 5 1),TokenOCB (AlexPn 37 6 1),TokenAction (AlexPn 43 7 5) "# \"action\"",TokenCCB (AlexPn 52 8 1),TokenAction (AlexPn 54 9 1) "# \"action\""] +Wyvern lexer v3 incorrect example: +Left "lexical error at line 4, column 13" +```