Skip to content

Commit 2647798

Browse files
committed
Extend and polish haddock for Pattern module
1 parent 363d4af commit 2647798

File tree

2 files changed

+96
-50
lines changed

2 files changed

+96
-50
lines changed

lib/Text/Regex/TDFA/Common.hs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,26 +109,26 @@ data ExecOption = ExecOption {
109109
captureGroups :: Bool -- ^ True by default. Set to False to improve speed (and space).
110110
} deriving (Read,Show)
111111

112-
-- | Used by implementation to name certain Postions during
113-
-- matching. Identity of Position tag to set during a transition.
112+
-- | Used by implementation to name certain 'Postion's during
113+
-- matching. Identity of 'Position' tag to set during a transition.
114114
type Tag = Int
115115

116-
-- | Internal use to indicate type of tag and preference for larger or smaller Positions.
116+
-- | Internal use to indicate type of tag and preference for larger or smaller 'Position's.
117117
data OP = Maximize | Minimize | Orbit | Ignore deriving (Eq,Show)
118118

119119
-- | Internal NFA node identity number.
120120
type Index = Int
121121

122-
-- | Internal DFA identity is this Set of NFA Index.
122+
-- | Internal DFA identity is this 'Set' of NFA 'Index'.
123123
type SetIndex = IntSet {- Index -}
124124

125125
-- | Index into the text being searched.
126126
type Position = Int
127127

128-
-- | GroupIndex is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group').
128+
-- | 'GroupIndex' is for indexing submatches from capturing parenthesized groups ('PGroup' or 'Group').
129129
type GroupIndex = Int
130130

131-
-- | GroupInfo collects the parent and tag information for an instance of a group.
131+
-- | 'GroupInfo' collects the parent and tag information for an instance of a group.
132132
data GroupInfo = GroupInfo {
133133
thisIndex, parentIndex :: GroupIndex
134134
, startTag, stopTag, flagTag :: Tag

lib/Text/Regex/TDFA/Pattern.hs

Lines changed: 90 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
-- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data
44
-- type and its subtypes. This 'Pattern' type is used to represent
5-
-- the parsed form of a Regular Expression.
5+
-- the parsed form of a regular expression.
66

77
module Text.Regex.TDFA.Pattern
88
(Pattern(..)
@@ -16,7 +16,7 @@ module Text.Regex.TDFA.Pattern
1616
,showPattern
1717
-- ** Internal use
1818
,starTrans
19-
-- ** Internal use, Operations to support debugging under ghci
19+
-- ** Internal use, operations to support debugging under @ghci@
2020
,starTrans',simplify',dfsPattern
2121
) where
2222

@@ -30,35 +30,64 @@ import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)
3030
err :: String -> a
3131
err = common_error "Text.Regex.TDFA.Pattern"
3232

33-
-- | Pattern is the type returned by the regular expression parser.
34-
-- This is consumed by the CorePattern module and the tender leaves
35-
-- are nibbled by the TNFA module.
36-
data Pattern = PEmpty
37-
| PGroup (Maybe GroupIndex) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!)
38-
| POr [Pattern] -- flattened by starTrans
39-
| PConcat [Pattern] -- flattened by starTrans
40-
| PQuest Pattern -- eliminated by starTrans
41-
| PPlus Pattern -- eliminated by starTrans
42-
| PStar Bool Pattern -- True means mayFirstBeNull is True
43-
| PBound Int (Maybe Int) Pattern -- eliminated by starTrans
44-
-- The rest of these need an index of where in the regex string it is from
45-
| PCarat {getDoPa::DoPa}
46-
| PDollar {getDoPa::DoPa}
47-
-- The following test and accept a single character
48-
| PDot {getDoPa::DoPa} -- Any character (newline?) at all
49-
| PAny {getDoPa::DoPa,getPatternSet::PatternSet} -- Square bracketed things
50-
| PAnyNot {getDoPa::DoPa,getPatternSet::PatternSet} -- Inverted square bracketed things
51-
| PEscape {getDoPa::DoPa,getPatternChar::Char} -- Backslashed Character
52-
| PChar {getDoPa::DoPa,getPatternChar::Char} -- Specific Character
53-
-- The following are semantic tags created in starTrans, not the parser
54-
| PNonCapture Pattern -- introduced by starTrans
55-
| PNonEmpty Pattern -- introduced by starTrans
56-
deriving (Eq,Show)
57-
58-
-- | I have not been checking, but this should have the property that
59-
-- parsing the resulting string should result in an identical Pattern.
60-
-- This is not true if starTrans has created PNonCapture and PNonEmpty
61-
-- values or a (PStar False). The contents of a "[ ]" grouping are
33+
-- | 'Pattern' is the type returned by the regular expression parser 'parseRegex'.
34+
-- This is consumed by the "Text.Regex.TDFA.CorePattern" module and the tender leaves
35+
-- are nibbled by the "Text.Regex.TDFA.TNFA" module.
36+
--
37+
-- The 'DoPa' field is the index of the component in the regex string @r@.
38+
data Pattern
39+
= PEmpty
40+
-- ^ @()@, matches the empty string.
41+
| PGroup (Maybe GroupIndex) Pattern
42+
-- ^ Group @(r)@. @Nothing@ indicates non-matching 'PGroup'
43+
-- (never produced by parser 'parseRegex').
44+
| POr [Pattern]
45+
-- ^ Alternative @r|s@ (flattened by 'starTrans').
46+
| PConcat [Pattern]
47+
-- ^ Sequence @rs@ (flattened by 'starTrans').
48+
| PQuest Pattern
49+
-- ^ Zero or one repetitions @r?@ (eliminated by 'starTrans').
50+
| PPlus Pattern
51+
-- ^ One or more repetitions @r+@ (eliminated by 'starTrans').
52+
| PStar Bool Pattern
53+
-- ^ Zero or more repetitions @r*@.
54+
-- @True@ (default) means may accept the empty string on its first iteration.
55+
| PBound Int (Maybe Int) Pattern
56+
-- ^ Given number or repetitions @r{n}@ or @r{n,m}@
57+
-- (eliminated by 'starTrans').
58+
59+
-- The rest of these need an index of where in the regex string it is from
60+
| PCarat { getDoPa :: DoPa }
61+
-- ^ @^@ matches beginning of input.
62+
| PDollar { getDoPa :: DoPa }
63+
-- ^ @$@ matches end of input.
64+
65+
-- The following test and accept a single character
66+
| PDot { getDoPa :: DoPa }
67+
-- ^ @.@ matches any character.
68+
| PAny { getDoPa :: DoPa, getPatternSet :: PatternSet }
69+
-- ^ Bracket expression @[...]@.
70+
| PAnyNot { getDoPa :: DoPa, getPatternSet :: PatternSet }
71+
-- ^ Inverted bracket expression @[^...]@.
72+
| PEscape { getDoPa :: DoPa, getPatternChar :: Char }
73+
-- ^ Backslashed character @\c@, may have special meaning.
74+
| PChar { getDoPa :: DoPa, getPatternChar :: Char }
75+
-- ^ Single character, matches given character.
76+
77+
-- The following are semantic tags created in starTrans, not the parser
78+
| PNonCapture Pattern
79+
-- ^ Tag for internal use, introduced by 'starTrans'.
80+
| PNonEmpty Pattern
81+
-- ^ Tag for internal use, introduced by 'starTrans'.
82+
deriving (Eq, Show)
83+
84+
-- Andreas Abel, 2022-07-18, issue #47:
85+
-- The following claim is FALSE:
86+
--
87+
-- I have not been checking, but this should have the property that
88+
-- parsing the resulting string should result in an identical 'Pattern'.
89+
-- This is not true if 'starTrans' has created 'PNonCapture' and 'PNonEmpty'
90+
-- values or a @'PStar' False@. The contents of a @[...]@ grouping are
6291
-- always shown in a sorted canonical order.
6392
showPattern :: Pattern -> String
6493
showPattern pIn =
@@ -93,12 +122,19 @@ showPattern pIn =
93122
-}
94123
paren s = ('(':s)++")"
95124

125+
-- | Content of a bracket expression @[...]@ organized into
126+
-- characters,
127+
-- POSIX character classes (e.g. @[[:alnum:]]@),
128+
-- collating elements (e.g. @[.ch.]@, unused), and
129+
-- equivalence classes (e.g. @[=a=]@, treated as characters).
130+
--
96131
data PatternSet = PatternSet (Maybe (Set Char))
97132
(Maybe (Set PatternSetCharacterClass))
98133
(Maybe (Set PatternSetCollatingElement))
99134
(Maybe (Set PatternSetEquivalenceClass))
100135
deriving (Eq)
101136

137+
-- | Hand-rolled implementation, giving textual rather than Haskell representation.
102138
instance Show PatternSet where
103139
showsPrec i (PatternSet s scc sce sec) =
104140
let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s
@@ -118,17 +154,27 @@ instance Show PatternSet where
118154
groupRange x n [] = if n <=3 then take n [x..]
119155
else x:'-':(toEnum (pred n+fromEnum x)):[]
120156

157+
-- | Content of @[: :]@, e.g. @"alnum"@ for @[:alnum:]@.
121158
newtype PatternSetCharacterClass = PatternSetCharacterClass {unSCC::String}
122159
deriving (Eq,Ord)
160+
161+
-- | Content of @[. .]@, e.g. @"ch"@ for @[.ch.]@.
123162
newtype PatternSetCollatingElement = PatternSetCollatingElement {unSCE::String}
124163
deriving (Eq,Ord)
164+
165+
-- | Content of @[= =]@, e.g. @"a"@ for @[=a=]@.
125166
newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass {unSEC::String}
126167
deriving (Eq,Ord)
127168

169+
-- | Hand-rolled implementation, giving textual rather than Haskell representation.
128170
instance Show PatternSetCharacterClass where
129171
showsPrec _ p = showChar '[' . showChar ':' . shows (unSCC p) . showChar ':' . showChar ']'
172+
173+
-- | Hand-rolled implementation, giving textual rather than Haskell representation.
130174
instance Show PatternSetCollatingElement where
131175
showsPrec _ p = showChar '[' . showChar '.' . shows (unSCE p) . showChar '.' . showChar ']'
176+
177+
-- | Hand-rolled implementation, giving textual rather than Haskell representation.
132178
instance Show PatternSetEquivalenceClass where
133179
showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']'
134180

@@ -165,18 +211,18 @@ decodeCharacterClass (PatternSetCharacterClass s) =
165211
-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
166212

167213
-- | Do the transformation and simplification in a single traversal.
168-
-- This removes the PPlus, PQuest, and PBound values, changing to POr
169-
-- and PEmpty and PStar True\/False. For some PBound values it adds
170-
-- PNonEmpty and PNonCapture semantic marker. It also simplifies to
171-
-- flatten out nested POr and PConcat instances and eliminate some
172-
-- unneeded PEmpty values.
214+
-- This removes the 'PPlus', 'PQuest', and 'PBound' values, changing to 'POr'
215+
-- and 'PEmpty' and 'PStar'. For some 'PBound' values it adds
216+
-- 'PNonEmpty' and 'PNonCapture' semantic marker. It also simplifies to
217+
-- flatten out nested 'POr' and 'PConcat' instances and eliminate some
218+
-- unneeded 'PEmpty' values.
173219
starTrans :: Pattern -> Pattern
174220
starTrans = dfsPattern (simplify' . starTrans')
175221

176-
-- | Apply a Pattern transformation function depth first
177-
dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function
178-
-> Pattern -- ^ The Pattern to transform
179-
-> Pattern -- ^ The transformed Pattern
222+
-- | Apply a 'Pattern' transformation function depth first.
223+
dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function.
224+
-> Pattern -- ^ The 'Pattern' to transform.
225+
-> Pattern -- ^ The transformed 'Pattern'.
180226
dfsPattern f = dfs
181227
where unary c = f . c . dfs
182228
dfs pattern = case pattern of
@@ -354,7 +400,7 @@ starTrans' pIn =
354400
pass = pIn
355401

356402
-- | Function to transform a pattern into an equivalent, but less
357-
-- redundant form. Nested 'POr' and 'PConcat' are flattened. PEmpty
403+
-- redundant form. Nested 'POr' and 'PConcat' are flattened. 'PEmpty'
358404
-- is propagated.
359405
simplify' :: Pattern -> Pattern
360406
simplify' x@(POr _) =
@@ -376,7 +422,7 @@ simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful
376422
--simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009
377423
simplify' other = other
378424

379-
-- | Function to flatten nested POr or nested PConcat applicataions.
425+
-- | Function to flatten nested 'POr' or nested 'PConcat' applicataions.
380426
flatten :: Pattern -> [Pattern]
381427
flatten (POr ps) = (concatMap (\x -> case x of
382428
POr ps' -> ps'
@@ -390,8 +436,8 @@ notPEmpty :: Pattern -> Bool
390436
notPEmpty PEmpty = False
391437
notPEmpty _ = True
392438

393-
-- | Determines if pIn will fail or accept [] and never accept any
394-
-- characters. Treat PCarat and PDollar as True.
439+
-- | Determines if 'Pattern' will fail or accept @[]@ and never accept any
440+
-- characters. Treat 'PCarat' and 'PDollar' as @True@.
395441
canOnlyMatchNull :: Pattern -> Bool
396442
canOnlyMatchNull pIn =
397443
case pIn of

0 commit comments

Comments
 (0)