Skip to content

Commit d44ff6a

Browse files
committed
Update to return full source ranges, 0-based line and column numbers
1 parent 85684b7 commit d44ff6a

File tree

12 files changed

+161
-137
lines changed

12 files changed

+161
-137
lines changed

Readme.md

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
Inchworm is a simple parser combinator framework specialized to
44
lexical analysis.
5-
Tokens can be specified via simple fold functions,
6-
and we include baked in source location handling.
5+
Tokens are specified via simple fold functions, and we include
6+
baked in source location handling.
77

88
If you want to parse expressions instead of performing lexical
99
analysis then try the `parsec` or `attoparsec` packages, which
@@ -32,18 +32,18 @@ error.
3232

3333
```
3434
import Text.Lexer.Inchworm.Char
35-
import qualified Data.Char as Char
36-
35+
import qualified Data.Char as Char
36+
3737
-- | A source token.
3838
data Token
3939
= KBra | KKet | KVar String | KCon String | KInt Integer
4040
deriving Show
41-
41+
4242
-- | A thing with attached location information.
4343
data Located a
44-
= Located FilePath Location a
44+
= Located FilePath (Range Location) a
4545
deriving Show
46-
46+
4747
-- | Scanner for a lispy language.
4848
scanner :: FilePath
4949
-> Scanner IO Location [Char] (Located Token)
@@ -58,11 +58,12 @@ scanner fileName
5858
, fmap (stamp KCon)
5959
$ munchWord (\ix c -> if ix == 0 then Char.isUpper c
6060
else Char.isAlpha c)
61-
]
61+
]
6262
where -- Stamp a token with source location information.
63-
stamp k (l, t)
64-
= Located fileName l (k t)
65-
63+
stamp k (range, t)
64+
= Located fileName range (k t)
65+
66+
main :: IO ()
6667
main
6768
= do let fileName = "Source.lispy"
6869
let source = "(some (Lispy like) 26 Program 93 (for you))"

Text/Lexer/Inchworm.hs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,19 @@
1111
-- are in the "Text.Lexer.Inchworm.Char" module.
1212
--
1313
-- No dependencies other than the Haskell 'base' library.
14+
--
15+
-- __ Release Notes __
16+
--
17+
-- @
18+
-- For 1.1.1.1:
19+
-- * Matching combinators now produce the first and final locations
20+
-- that matched.
21+
-- * Line and column offsets are now 0-based instead of 1-based,
22+
-- for easier interface with client editors that expect this (eg VSCode).
23+
-- Thanks to Amos Robinson:
24+
-- * Haskell string parser now correctly handles string gaps and the
25+
-- string escape character \\&
26+
-- @
1427
--
1528
-- __ Minimal example __
1629
--

Text/Lexer/Inchworm/Char.hs

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ module Text.Lexer.Inchworm.Char
77
, scanStringIO
88

99
-- * Locations
10-
, Location (..)
10+
, Range (..), Location (..)
1111
, bumpLocationWithChar
1212

1313
-- * Scanners
@@ -33,7 +33,7 @@ scanStringIO
3333

3434
scanStringIO str scanner
3535
= scanListIO
36-
(Location 1 1)
36+
(Location 0 0)
3737
bumpLocationWithChar
3838
str scanner
3939

@@ -45,35 +45,35 @@ scanStringIO str scanner
4545
bumpLocationWithChar :: Char -> Location -> Location
4646
bumpLocationWithChar c (Location line col)
4747
= case c of
48-
'\n' -> Location (line + 1) 1
48+
'\n' -> Location (line + 1) 0
4949
_ -> Location line (col + 1)
5050

5151

5252
-- Integers -------------------------------------------------------------------
5353
-- | Scan a decimal integer, with optional @-@ and @+@ sign specifiers.
5454
scanInteger
5555
:: Monad m
56-
=> Scanner m loc [Char] (loc, Integer)
56+
=> Scanner m loc [Char] (Range loc, Integer)
5757

5858
scanInteger
5959
= munchPred Nothing matchInt acceptInt
6060
where
6161
matchInt 0 !c
6262
= c == '-' || c == '+' || Char.isDigit c
6363

64-
matchInt _ !c = Char.isDigit c
64+
matchInt _ !c = Char.isDigit c
6565

6666
acceptInt ('+' : cs)
67-
| null cs = Nothing
67+
| null cs = Nothing
6868

6969
acceptInt ('-' : cs)
70-
| null cs = Nothing
70+
| null cs = Nothing
7171

72-
acceptInt cs = Just $ read cs
72+
acceptInt cs = Just $ read cs
7373

7474
{-# SPECIALIZE INLINE
7575
scanInteger
76-
:: Scanner IO Location [Char] (Location, Integer)
76+
:: Scanner IO Location [Char] (Range Location, Integer)
7777
#-}
7878

7979
-- Strings --------------------------------------------------------------------
@@ -83,7 +83,7 @@ scanInteger
8383
--
8484
scanHaskellString
8585
:: Monad m
86-
=> Scanner m loc [Char] (loc, String)
86+
=> Scanner m loc [Char] (Range loc, String)
8787

8888
scanHaskellString
8989
= munchFold Nothing matchC (False, False) acceptC
@@ -112,7 +112,7 @@ scanHaskellString
112112

113113
{-# SPECIALIZE INLINE
114114
scanHaskellString
115-
:: Scanner IO Location [Char] (Location, String)
115+
:: Scanner IO Location [Char] (Range Location, String)
116116
#-}
117117

118118

@@ -123,7 +123,7 @@ scanHaskellString
123123
--
124124
scanHaskellChar
125125
:: Monad m
126-
=> Scanner m loc [Char] (loc, Char)
126+
=> Scanner m loc [Char] (Range loc, Char)
127127

128128
scanHaskellChar
129129
= munchFold Nothing matchC (False, False) acceptC
@@ -141,23 +141,24 @@ scanHaskellChar
141141

142142
acceptC ('\'' : cs)
143143
= case readChar cs of
144-
-- Character literals do not support gaps or escape terminators
144+
-- Character literals do not support gaps or
145+
-- escape terminators
145146
Just (Just c, "\'") -> Just c
146147
_ -> Nothing
147148

148149
acceptC _ = Nothing
149150

150151
{-# SPECIALIZE INLINE
151152
scanHaskellChar
152-
:: Scanner IO Location [Char] (Location, Char)
153+
:: Scanner IO Location [Char] (Range Location, Char)
153154
#-}
154155

155156

156157
-- Comments -------------------------------------------------------------------
157158
-- | Scan a Haskell block comment.
158159
scanHaskellCommentBlock
159160
:: Monad m
160-
=> Scanner m loc [Char] (loc, String)
161+
=> Scanner m loc [Char] (Range loc, String)
161162

162163
scanHaskellCommentBlock
163164
= munchFold Nothing matchC (' ', True) acceptC
@@ -177,14 +178,14 @@ scanHaskellCommentBlock
177178

178179
{-# SPECIALIZE INLINE
179180
scanHaskellCommentBlock
180-
:: Scanner IO Location [Char] (Location, String)
181+
:: Scanner IO Location [Char] (Range Location, String)
181182
#-}
182183

183184

184185
-- | Scan a Haskell line comment.
185186
scanHaskellCommentLine
186187
:: Monad m
187-
=> Scanner m loc [Char] (loc, String)
188+
=> Scanner m loc [Char] (Range loc, String)
188189

189190
scanHaskellCommentLine
190191
= munchPred Nothing matchC acceptC
@@ -201,7 +202,7 @@ scanHaskellCommentLine
201202

202203
{-# SPECIALIZE INLINE
203204
scanHaskellCommentLine
204-
:: Scanner IO Location [Char] (Location, String)
205+
:: Scanner IO Location [Char] (Range Location, String)
205206
#-}
206207

207208

@@ -223,12 +224,13 @@ decodeString ss0
223224
Just (Nothing, cs') -> go acc cs'
224225
Nothing -> go (c : acc) cs
225226

226-
-- | Result of reading a character: either a real char, or an empty string that is a
227-
-- successful read, but contains no characters.
228-
-- These empty strings are sometimes required to remove ambiguity: for example,
229-
-- '\SO' and '\SOH' are both valid escapes.
230-
-- To distinguish between the strings ['\SO', 'H'] and ['\SOH'], it is necessary
231-
-- to explicitly terminate the escape for the former: '\SO\&H' means ['\SO', 'H'].
227+
-- | Result of reading a character: either a real char, or an empty string
228+
-- that is a successful read, but contains no characters.
229+
-- These empty strings are sometimes required to remove ambiguity:
230+
-- for example, '\SO' and '\SOH' are both valid escapes.
231+
-- To distinguish between the strings ['\SO', 'H'] and ['\SOH'],
232+
-- it is necessary to explicitly terminate the escape for the former:
233+
-- '\SO\&H' means ['\SO', 'H'].
232234
type CharGap = Maybe Char
233235

234236
-- | Read a character literal, handling escape codes.
@@ -246,13 +248,13 @@ readChar ('\\' : 'o' : cs)
246248

247249
-- Control characters defined by caret characters, like \^G
248250
readChar ('\\' : '^' : c : rest)
249-
| c >= 'A' && c <= 'Z' = Just (Just $ Char.chr (Char.ord c - 1), rest)
250-
| c == '@' = Just (Just $ Char.chr 0, rest)
251-
| c == '[' = Just (Just $ Char.chr 27, rest)
252-
| c == '\\' = Just (Just $ Char.chr 28, rest)
253-
| c == ']' = Just (Just $ Char.chr 29, rest)
254-
| c == '^' = Just (Just $ Char.chr 30, rest)
255-
| c == '_' = Just (Just $ Char.chr 31, rest)
251+
| c >= 'A' && c <= 'Z' = Just (Just $ Char.chr (Char.ord c - 1), rest)
252+
| c == '@' = Just (Just $ Char.chr 0, rest)
253+
| c == '[' = Just (Just $ Char.chr 27, rest)
254+
| c == '\\' = Just (Just $ Char.chr 28, rest)
255+
| c == ']' = Just (Just $ Char.chr 29, rest)
256+
| c == '^' = Just (Just $ Char.chr 30, rest)
257+
| c == '_' = Just (Just $ Char.chr 31, rest)
256258

257259
-- Control characters defined by decimal escape codes.
258260
readChar ('\\' : cs)
@@ -276,16 +278,16 @@ readChar ('\\' : cs)
276278
= let go [] = Nothing
277279
go ((str, c) : moar)
278280
= case List.stripPrefix str cs of
279-
Nothing -> go moar
280-
Just rest -> Just (Just c, rest)
281+
Nothing -> go moar
282+
Just rest -> Just (Just c, rest)
281283

282284
in go escapedChars
283285

284286
-- Just a regular character.
285-
readChar (c : rest) = Just (Just c, rest)
287+
readChar (c : rest) = Just (Just c, rest)
286288

287289
-- Nothing to read.
288-
readChar _ = Nothing
290+
readChar _ = Nothing
289291

290292
escapedChars :: [(String, Char)]
291293
escapedChars

0 commit comments

Comments
 (0)