Skip to content

Commit ae525e3

Browse files
committed
[ fixed #119 ] latin1 encoding: each byte counts as 1 char
The computation of the length component of AlexToken was tailored to the utf8 encoding, and didn't work correctly for latin1. This is fixed by having a new flag ALEX_LATIN1 in templates/GenericTemplate.hs that turns on code that increases the length by 1 for each byte, while for utf8 something more sophisticated is done. The fix requires more template instances to be generated. To streamline the instance generation, now all 2^4 = 16 template instances are generated for the 4 flags - ghc - latin1 - nopred - debug To ensure consistent reference to the template instance, a function templateFileName residing both in src/Main and gen-alex-sdist/Main needs to be kept consistent, should more dimensions be added to the template. (Putting this function into a separate file that is included by both modules could be an option, but seemed not enough in the spirit of cabal-organized projects.)
1 parent 0198f73 commit ae525e3

File tree

7 files changed

+154
-30
lines changed

7 files changed

+154
-30
lines changed

alex.cabal

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,21 @@ data-dir: data/
3838

3939
data-files:
4040
AlexTemplate
41+
AlexTemplate-debug
42+
AlexTemplate-nopred
43+
AlexTemplate-nopred-debug
44+
AlexTemplate-latin1
45+
AlexTemplate-latin1-debug
46+
AlexTemplate-latin1-nopred
47+
AlexTemplate-latin1-nopred-debug
4148
AlexTemplate-ghc
42-
AlexTemplate-ghc-nopred
4349
AlexTemplate-ghc-debug
44-
AlexTemplate-debug
50+
AlexTemplate-ghc-nopred
51+
AlexTemplate-ghc-nopred-debug
52+
AlexTemplate-ghc-latin1
53+
AlexTemplate-ghc-latin1-debug
54+
AlexTemplate-ghc-latin1-nopred
55+
AlexTemplate-ghc-latin1-nopred-debug
4556
AlexWrapper-basic
4657
AlexWrapper-basic-bytestring
4758
AlexWrapper-strict-bytestring
@@ -110,6 +121,7 @@ extra-source-files:
110121
tests/strict_typeclass.x
111122
tests/unicode.x
112123
tests/issue_71.x
124+
tests/issue_119.x
113125

114126
source-repository head
115127
type: git

gen-alex-sdist/Main.hs

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
module Main (main) where
22

33
import Control.Monad
4+
import qualified Data.List as List
45
import Language.Preprocessor.Cpphs
56
import System.Directory
67
import System.FilePath
@@ -51,13 +52,37 @@ all_template_files :: [FilePath]
5152
all_template_files = map fst (templates ++ wrappers)
5253

5354
templates :: [(FilePath,[String])]
54-
templates = [
55-
("AlexTemplate", []),
56-
("AlexTemplate-ghc", ["ALEX_GHC"]),
57-
("AlexTemplate-ghc-nopred",["ALEX_GHC", "ALEX_NOPRED"]),
58-
("AlexTemplate-ghc-debug", ["ALEX_GHC","ALEX_DEBUG"]),
59-
("AlexTemplate-debug", ["ALEX_DEBUG"])
60-
]
55+
templates =
56+
[ ( templateFileName ghc latin1 nopred debug
57+
, templateFlags ghc latin1 nopred debug
58+
)
59+
| ghc <- allBool
60+
, latin1 <- allBool
61+
, nopred <- allBool
62+
, debug <- allBool
63+
]
64+
where
65+
allBool = [False, True]
66+
67+
-- Keep this function in sync with its twin in src/Main.hs.
68+
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
69+
templateFileName ghc latin1 nopred debug =
70+
List.intercalate "-" $ concat
71+
[ [ "AlexTemplate" ]
72+
, [ "ghc" | ghc ]
73+
, [ "latin1" | latin1 ]
74+
, [ "nopred" | nopred ]
75+
, [ "debug" | debug ]
76+
]
77+
78+
templateFlags :: Bool -> Bool -> Bool -> Bool -> [String]
79+
templateFlags ghc latin1 nopred debug =
80+
map ("ALEX_" ++) $ concat
81+
[ [ "GHC" | ghc ]
82+
, [ "LATIN1" | latin1 ]
83+
, [ "NOPRED" | nopred ]
84+
, [ "DEBUG" | debug ]
85+
]
6186

6287
wrappers :: [(FilePath,[String])]
6388
wrappers = [

src/AbsSyn.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ type StartCode = Int
170170
-- we can generate somewhat faster code in the case that
171171
-- the lexer doesn't use predicates
172172
data UsesPreds = UsesPreds | DoesntUsePreds
173+
deriving Eq
173174

174175
usesPreds :: DFA s a -> UsesPreds
175176
usesPreds dfa
@@ -390,3 +391,4 @@ extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str .
390391
-- Code generation targets
391392

392393
data Target = GhcTarget | HaskellTarget
394+
deriving Eq

src/Main.hs

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import Control.Exception ( bracketOnError )
3333
import Control.Monad ( when, liftM )
3434
import Data.Char ( chr )
3535
import Data.List ( isSuffixOf, nub )
36+
import qualified Data.List as List
3637
import Data.Maybe ( isJust, fromJust )
3738
import Data.Version ( showVersion )
3839
import System.Console.GetOpt ( getOpt, usageInfo, ArgOrder(..), OptDescr(..), ArgDescr(..) )
@@ -218,7 +219,7 @@ alex cli file basename script = do
218219
hPutStr out_h (actions "")
219220

220221
-- add the template
221-
let template_name = templateFile template_dir target usespreds cli
222+
let template_name = templateFile template_dir target encoding usespreds cli
222223
tmplt <- alexReadFile template_name
223224
hPutStr out_h tmplt
224225

@@ -399,23 +400,27 @@ templateDir def cli
399400
[] -> def
400401
ds -> return (last ds)
401402

402-
templateFile :: FilePath -> Target -> UsesPreds -> [CLIFlags] -> FilePath
403-
templateFile dir target usespreds cli
404-
= dir ++ "/AlexTemplate" ++ maybe_ghc ++ maybe_debug ++ maybe_nopred
405-
where
406-
maybe_ghc = case target of
407-
GhcTarget -> "-ghc"
408-
_ -> ""
409-
410-
maybe_debug
411-
| OptDebugParser `elem` cli = "-debug"
412-
| otherwise = ""
413-
414-
maybe_nopred =
415-
case usespreds of
416-
DoesntUsePreds | not (null maybe_ghc)
417-
&& null maybe_debug -> "-nopred"
418-
_ -> ""
403+
-- Keep this function in sync with its twin in gen-alex-sdist/Main.hs.
404+
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
405+
templateFileName ghc latin1 nopred debug =
406+
List.intercalate "-" $ concat
407+
[ [ "AlexTemplate" ]
408+
, [ "ghc" | ghc ]
409+
, [ "latin1" | latin1 ]
410+
, [ "nopred" | nopred ]
411+
, [ "debug" | debug ]
412+
]
413+
414+
templateFile :: FilePath -> Target -> Encoding -> UsesPreds -> [CLIFlags] -> FilePath
415+
templateFile dir target encoding usespreds cli = concat
416+
[ dir
417+
, "/"
418+
, templateFileName
419+
(target == GhcTarget)
420+
(encoding == Latin1)
421+
(usespreds == DoesntUsePreds)
422+
(OptDebugParser `elem` cli)
423+
]
419424

420425
wrapperFile :: FilePath -> Scheme -> Maybe FilePath
421426
wrapperFile dir scheme =

templates/GenericTemplate.hs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,15 @@ alex_scan_tkn user__ orig_input len input__ s last_acc =
175175
ILIT(-1) -> (new_acc, input__)
176176
-- on an error, we want to keep the input *before* the
177177
-- character that failed, not after.
178-
_ -> alex_scan_tkn user__ orig_input (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
179-
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
180-
new_input new_s new_acc
178+
_ -> alex_scan_tkn user__ orig_input
179+
#ifdef ALEX_LATIN1
180+
PLUS(len,ILIT(1))
181+
-- issue 119: in the latin1 encoding, *each* byte is one character
182+
#else
183+
(if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
184+
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
185+
#endif
186+
new_input new_s new_acc
181187
}
182188
where
183189
check_accs (AlexAccNone) = last_acc

tests/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ TESTS = \
4141
default_typeclass.x \
4242
gscan_typeclass.x \
4343
issue_71.x \
44+
issue_119.x \
4445
monad_typeclass.x \
4546
monad_typeclass_bytestring.x \
4647
monadUserState_typeclass.x \

tests/issue_119.x

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
-- -*- haskell -*-
2+
{
3+
-- Issue 119,
4+
-- reported 2017-10-11 by Herbert Valerio Riedel,
5+
-- fixed 2020-01-26 by Andreas Abel.
6+
--
7+
-- Problem was: the computed token length (in number of characters)
8+
-- attached to AlexToken is tailored to UTF8 encoding and wrong
9+
-- for LATIN1 encoding.
10+
11+
module Main where
12+
13+
import Control.Monad (unless)
14+
import qualified Data.ByteString as B
15+
import Data.Word
16+
import System.Exit (exitFailure)
17+
}
18+
19+
%encoding "latin1"
20+
21+
:-
22+
23+
[\x01-\xff]+ { False }
24+
[\x00] { True }
25+
26+
{
27+
type AlexInput = B.ByteString
28+
29+
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
30+
alexGetByte = B.uncons
31+
32+
alexInputPrevChar :: AlexInput -> Char
33+
alexInputPrevChar = undefined
34+
35+
-- generated by @alex@
36+
alexScan :: AlexInput -> Int -> AlexReturn Bool
37+
38+
{-
39+
40+
GOOD cases:
41+
42+
("012\NUL3","012","\NUL3",3,3,False)
43+
("\NUL0","\NUL","0",1,1,True)
44+
("012","012","",3,3,False)
45+
46+
BAD case:
47+
48+
("0@P`p\128\144\160","0@P`p","",5,8,False)
49+
50+
expected:
51+
52+
("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False)
53+
54+
-}
55+
main :: IO ()
56+
main = do
57+
go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD
58+
go (B.pack [0x00,0x30]) -- GOOD
59+
go (B.pack [0x30,0x31,0x32]) -- GOOD
60+
61+
go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- WAS: BAD
62+
where
63+
go inp = do
64+
case (alexScan inp 0) of
65+
-- expected invariant: len == B.length inp - B.length inp'
66+
AlexToken inp' len b -> do
67+
let diff = B.length inp - B.length inp'
68+
unless (len == diff) $ do
69+
putStrLn $ "ERROR: reported length and consumed length differ!"
70+
print (inp, B.take len inp, inp', len, diff, b)
71+
exitFailure
72+
_ -> undefined
73+
}

0 commit comments

Comments
 (0)