Skip to content

Commit 574ec8c

Browse files
authored
Merge pull request #156 from andreasabel/issue119
[ fixed #119 ] latin1 encoding: each byte counts as 1 char
2 parents ce48441 + ae525e3 commit 574ec8c

File tree

7 files changed

+154
-30
lines changed

7 files changed

+154
-30
lines changed

alex.cabal

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,21 @@ data-dir: data/
3838

3939
data-files:
4040
AlexTemplate
41+
AlexTemplate-debug
42+
AlexTemplate-nopred
43+
AlexTemplate-nopred-debug
44+
AlexTemplate-latin1
45+
AlexTemplate-latin1-debug
46+
AlexTemplate-latin1-nopred
47+
AlexTemplate-latin1-nopred-debug
4148
AlexTemplate-ghc
42-
AlexTemplate-ghc-nopred
4349
AlexTemplate-ghc-debug
44-
AlexTemplate-debug
50+
AlexTemplate-ghc-nopred
51+
AlexTemplate-ghc-nopred-debug
52+
AlexTemplate-ghc-latin1
53+
AlexTemplate-ghc-latin1-debug
54+
AlexTemplate-ghc-latin1-nopred
55+
AlexTemplate-ghc-latin1-nopred-debug
4556
AlexWrapper-basic
4657
AlexWrapper-basic-bytestring
4758
AlexWrapper-strict-bytestring
@@ -110,6 +121,7 @@ extra-source-files:
110121
tests/strict_typeclass.x
111122
tests/unicode.x
112123
tests/issue_71.x
124+
tests/issue_119.x
113125

114126
source-repository head
115127
type: git

gen-alex-sdist/Main.hs

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
module Main (main) where
22

33
import Control.Monad
4+
import qualified Data.List as List
45
import Language.Preprocessor.Cpphs
56
import System.Directory
67
import System.FilePath
@@ -51,13 +52,37 @@ all_template_files :: [FilePath]
5152
all_template_files = map fst (templates ++ wrappers)
5253

5354
templates :: [(FilePath,[String])]
54-
templates = [
55-
("AlexTemplate", []),
56-
("AlexTemplate-ghc", ["ALEX_GHC"]),
57-
("AlexTemplate-ghc-nopred",["ALEX_GHC", "ALEX_NOPRED"]),
58-
("AlexTemplate-ghc-debug", ["ALEX_GHC","ALEX_DEBUG"]),
59-
("AlexTemplate-debug", ["ALEX_DEBUG"])
60-
]
55+
templates =
56+
[ ( templateFileName ghc latin1 nopred debug
57+
, templateFlags ghc latin1 nopred debug
58+
)
59+
| ghc <- allBool
60+
, latin1 <- allBool
61+
, nopred <- allBool
62+
, debug <- allBool
63+
]
64+
where
65+
allBool = [False, True]
66+
67+
-- Keep this function in sync with its twin in src/Main.hs.
68+
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
69+
templateFileName ghc latin1 nopred debug =
70+
List.intercalate "-" $ concat
71+
[ [ "AlexTemplate" ]
72+
, [ "ghc" | ghc ]
73+
, [ "latin1" | latin1 ]
74+
, [ "nopred" | nopred ]
75+
, [ "debug" | debug ]
76+
]
77+
78+
templateFlags :: Bool -> Bool -> Bool -> Bool -> [String]
79+
templateFlags ghc latin1 nopred debug =
80+
map ("ALEX_" ++) $ concat
81+
[ [ "GHC" | ghc ]
82+
, [ "LATIN1" | latin1 ]
83+
, [ "NOPRED" | nopred ]
84+
, [ "DEBUG" | debug ]
85+
]
6186

6287
wrappers :: [(FilePath,[String])]
6388
wrappers = [

src/AbsSyn.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ type StartCode = Int
170170
-- we can generate somewhat faster code in the case that
171171
-- the lexer doesn't use predicates
172172
data UsesPreds = UsesPreds | DoesntUsePreds
173+
deriving Eq
173174

174175
usesPreds :: DFA s a -> UsesPreds
175176
usesPreds dfa
@@ -390,3 +391,4 @@ extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str .
390391
-- Code generation targets
391392

392393
data Target = GhcTarget | HaskellTarget
394+
deriving Eq

src/Main.hs

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import Control.Exception ( bracketOnError )
3333
import Control.Monad ( when, liftM )
3434
import Data.Char ( chr )
3535
import Data.List ( isSuffixOf, nub )
36+
import qualified Data.List as List
3637
import Data.Maybe ( isJust, fromJust )
3738
import Data.Version ( showVersion )
3839
import System.Console.GetOpt ( getOpt, usageInfo, ArgOrder(..), OptDescr(..), ArgDescr(..) )
@@ -222,7 +223,7 @@ alex cli file basename script = do
222223
hPutStr out_h (actions "")
223224

224225
-- add the template
225-
let template_name = templateFile template_dir target usespreds cli
226+
let template_name = templateFile template_dir target encoding usespreds cli
226227
tmplt <- alexReadFile template_name
227228
hPutStr out_h tmplt
228229

@@ -403,23 +404,27 @@ templateDir def cli
403404
[] -> def
404405
ds -> return (last ds)
405406

406-
templateFile :: FilePath -> Target -> UsesPreds -> [CLIFlags] -> FilePath
407-
templateFile dir target usespreds cli
408-
= dir ++ "/AlexTemplate" ++ maybe_ghc ++ maybe_debug ++ maybe_nopred
409-
where
410-
maybe_ghc = case target of
411-
GhcTarget -> "-ghc"
412-
_ -> ""
413-
414-
maybe_debug
415-
| OptDebugParser `elem` cli = "-debug"
416-
| otherwise = ""
417-
418-
maybe_nopred =
419-
case usespreds of
420-
DoesntUsePreds | not (null maybe_ghc)
421-
&& null maybe_debug -> "-nopred"
422-
_ -> ""
407+
-- Keep this function in sync with its twin in gen-alex-sdist/Main.hs.
408+
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
409+
templateFileName ghc latin1 nopred debug =
410+
List.intercalate "-" $ concat
411+
[ [ "AlexTemplate" ]
412+
, [ "ghc" | ghc ]
413+
, [ "latin1" | latin1 ]
414+
, [ "nopred" | nopred ]
415+
, [ "debug" | debug ]
416+
]
417+
418+
templateFile :: FilePath -> Target -> Encoding -> UsesPreds -> [CLIFlags] -> FilePath
419+
templateFile dir target encoding usespreds cli = concat
420+
[ dir
421+
, "/"
422+
, templateFileName
423+
(target == GhcTarget)
424+
(encoding == Latin1)
425+
(usespreds == DoesntUsePreds)
426+
(OptDebugParser `elem` cli)
427+
]
423428

424429
wrapperFile :: FilePath -> Scheme -> Maybe FilePath
425430
wrapperFile dir scheme =

templates/GenericTemplate.hs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,15 @@ alex_scan_tkn user__ orig_input len input__ s last_acc =
175175
ILIT(-1) -> (new_acc, input__)
176176
-- on an error, we want to keep the input *before* the
177177
-- character that failed, not after.
178-
_ -> alex_scan_tkn user__ orig_input (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
179-
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
180-
new_input new_s new_acc
178+
_ -> alex_scan_tkn user__ orig_input
179+
#ifdef ALEX_LATIN1
180+
PLUS(len,ILIT(1))
181+
-- issue 119: in the latin1 encoding, *each* byte is one character
182+
#else
183+
(if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
184+
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
185+
#endif
186+
new_input new_s new_acc
181187
}
182188
where
183189
check_accs (AlexAccNone) = last_acc

tests/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ TESTS = \
4141
default_typeclass.x \
4242
gscan_typeclass.x \
4343
issue_71.x \
44+
issue_119.x \
4445
monad_typeclass.x \
4546
monad_typeclass_bytestring.x \
4647
monadUserState_typeclass.x \

tests/issue_119.x

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
-- -*- haskell -*-
2+
{
3+
-- Issue 119,
4+
-- reported 2017-10-11 by Herbert Valerio Riedel,
5+
-- fixed 2020-01-26 by Andreas Abel.
6+
--
7+
-- Problem was: the computed token length (in number of characters)
8+
-- attached to AlexToken is tailored to UTF8 encoding and wrong
9+
-- for LATIN1 encoding.
10+
11+
module Main where
12+
13+
import Control.Monad (unless)
14+
import qualified Data.ByteString as B
15+
import Data.Word
16+
import System.Exit (exitFailure)
17+
}
18+
19+
%encoding "latin1"
20+
21+
:-
22+
23+
[\x01-\xff]+ { False }
24+
[\x00] { True }
25+
26+
{
27+
type AlexInput = B.ByteString
28+
29+
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
30+
alexGetByte = B.uncons
31+
32+
alexInputPrevChar :: AlexInput -> Char
33+
alexInputPrevChar = undefined
34+
35+
-- generated by @alex@
36+
alexScan :: AlexInput -> Int -> AlexReturn Bool
37+
38+
{-
39+
40+
GOOD cases:
41+
42+
("012\NUL3","012","\NUL3",3,3,False)
43+
("\NUL0","\NUL","0",1,1,True)
44+
("012","012","",3,3,False)
45+
46+
BAD case:
47+
48+
("0@P`p\128\144\160","0@P`p","",5,8,False)
49+
50+
expected:
51+
52+
("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False)
53+
54+
-}
55+
main :: IO ()
56+
main = do
57+
go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD
58+
go (B.pack [0x00,0x30]) -- GOOD
59+
go (B.pack [0x30,0x31,0x32]) -- GOOD
60+
61+
go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- WAS: BAD
62+
where
63+
go inp = do
64+
case (alexScan inp 0) of
65+
-- expected invariant: len == B.length inp - B.length inp'
66+
AlexToken inp' len b -> do
67+
let diff = B.length inp - B.length inp'
68+
unless (len == diff) $ do
69+
putStrLn $ "ERROR: reported length and consumed length differ!"
70+
print (inp, B.take len inp, inp', len, diff, b)
71+
exitFailure
72+
_ -> undefined
73+
}

0 commit comments

Comments
 (0)