22
33-- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data
44-- type and its subtypes. This 'Pattern' type is used to represent
5- -- the parsed form of a Regular Expression .
5+ -- the parsed form of a regular expression .
66
77module Text.Regex.TDFA.Pattern
88 (Pattern (.. )
@@ -16,7 +16,7 @@ module Text.Regex.TDFA.Pattern
1616 ,showPattern
1717-- ** Internal use
1818 ,starTrans
19- -- ** Internal use, Operations to support debugging under ghci
19+ -- ** Internal use, operations to support debugging under @ ghci@
2020 ,starTrans' ,simplify' ,dfsPattern
2121 ) where
2222
@@ -30,35 +30,64 @@ import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)
3030err :: String -> a
3131err = common_error " Text.Regex.TDFA.Pattern"
3232
33- -- | Pattern is the type returned by the regular expression parser.
34- -- This is consumed by the CorePattern module and the tender leaves
35- -- are nibbled by the TNFA module.
36- data Pattern = PEmpty
37- | PGroup (Maybe GroupIndex ) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!)
38- | POr [Pattern ] -- flattened by starTrans
39- | PConcat [Pattern ] -- flattened by starTrans
40- | PQuest Pattern -- eliminated by starTrans
41- | PPlus Pattern -- eliminated by starTrans
42- | PStar Bool Pattern -- True means mayFirstBeNull is True
43- | PBound Int (Maybe Int ) Pattern -- eliminated by starTrans
44- -- The rest of these need an index of where in the regex string it is from
45- | PCarat { getDoPa :: DoPa }
46- | PDollar { getDoPa :: DoPa }
47- -- The following test and accept a single character
48- | PDot { getDoPa :: DoPa } -- Any character (newline?) at all
49- | PAny { getDoPa :: DoPa ,getPatternSet :: PatternSet } -- Square bracketed things
50- | PAnyNot { getDoPa :: DoPa ,getPatternSet :: PatternSet } -- Inverted square bracketed things
51- | PEscape { getDoPa :: DoPa ,getPatternChar :: Char } -- Backslashed Character
52- | PChar { getDoPa :: DoPa ,getPatternChar :: Char } -- Specific Character
53- -- The following are semantic tags created in starTrans, not the parser
54- | PNonCapture Pattern -- introduced by starTrans
55- | PNonEmpty Pattern -- introduced by starTrans
56- deriving (Eq ,Show )
57-
58- -- | I have not been checking, but this should have the property that
59- -- parsing the resulting string should result in an identical Pattern.
60- -- This is not true if starTrans has created PNonCapture and PNonEmpty
61- -- values or a (PStar False). The contents of a "[ ]" grouping are
33+ -- | 'Pattern' is the type returned by the regular expression parser 'parseRegex'.
34+ -- This is consumed by the "Text.Regex.TDFA.CorePattern" module and the tender leaves
35+ -- are nibbled by the "Text.Regex.TDFA.TNFA" module.
36+ --
37+ -- The 'DoPa' field is the index of the component in the regex string @r@.
38+ data Pattern
39+ = PEmpty
40+ -- ^ @()@, matches the empty string.
41+ | PGroup (Maybe GroupIndex ) Pattern
42+ -- ^ Group @(r)@. @Nothing@ indicates non-matching 'PGroup'
43+ -- (never produced by parser 'parseRegex').
44+ | POr [Pattern ]
45+ -- ^ Alternative @r|s@ (flattened by 'starTrans').
46+ | PConcat [Pattern ]
47+ -- ^ Sequence @rs@ (flattened by 'starTrans').
48+ | PQuest Pattern
49+ -- ^ Zero or one repetitions @r?@ (eliminated by 'starTrans').
50+ | PPlus Pattern
51+ -- ^ One or more repetitions @r+@ (eliminated by 'starTrans').
52+ | PStar Bool Pattern
53+ -- ^ Zero or more repetitions @r*@.
54+ -- @True@ (default) means may accept the empty string on its first iteration.
55+ | PBound Int (Maybe Int ) Pattern
56+ -- ^ Given number or repetitions @r{n}@ or @r{n,m}@
57+ -- (eliminated by 'starTrans').
58+
59+ -- The rest of these need an index of where in the regex string it is from
60+ | PCarat { getDoPa :: DoPa }
61+ -- ^ @^@ matches beginning of input.
62+ | PDollar { getDoPa :: DoPa }
63+ -- ^ @$@ matches end of input.
64+
65+ -- The following test and accept a single character
66+ | PDot { getDoPa :: DoPa }
67+ -- ^ @.@ matches any character.
68+ | PAny { getDoPa :: DoPa , getPatternSet :: PatternSet }
69+ -- ^ Bracket expression @[...]@.
70+ | PAnyNot { getDoPa :: DoPa , getPatternSet :: PatternSet }
71+ -- ^ Inverted bracket expression @[^...]@.
72+ | PEscape { getDoPa :: DoPa , getPatternChar :: Char }
73+ -- ^ Backslashed character @\c@, may have special meaning.
74+ | PChar { getDoPa :: DoPa , getPatternChar :: Char }
75+ -- ^ Single character, matches given character.
76+
77+ -- The following are semantic tags created in starTrans, not the parser
78+ | PNonCapture Pattern
79+ -- ^ Tag for internal use, introduced by 'starTrans'.
80+ | PNonEmpty Pattern
81+ -- ^ Tag for internal use, introduced by 'starTrans'.
82+ deriving (Eq , Show )
83+
84+ -- Andreas Abel, 2022-07-18, issue #47:
85+ -- The following claim is FALSE:
86+ --
87+ -- I have not been checking, but this should have the property that
88+ -- parsing the resulting string should result in an identical 'Pattern'.
89+ -- This is not true if 'starTrans' has created 'PNonCapture' and 'PNonEmpty'
90+ -- values or a @'PStar' False@. The contents of a @[...]@ grouping are
6291-- always shown in a sorted canonical order.
6392showPattern :: Pattern -> String
6493showPattern pIn =
@@ -93,12 +122,19 @@ showPattern pIn =
93122-}
94123 paren s = (' (' : s)++ " )"
95124
125+ -- | Content of a bracket expression @[...]@ organized into
126+ -- characters,
127+ -- POSIX character classes (e.g. @[[:alnum:]]@),
128+ -- collating elements (e.g. @[.ch.]@, unused), and
129+ -- equivalence classes (e.g. @[=a=]@, treated as characters).
130+ --
96131data PatternSet = PatternSet (Maybe (Set Char ))
97132 (Maybe (Set PatternSetCharacterClass ))
98133 (Maybe (Set PatternSetCollatingElement ))
99134 (Maybe (Set PatternSetEquivalenceClass ))
100135 deriving (Eq )
101136
137+ -- | Hand-rolled implementation, giving textual rather than Haskell representation.
102138instance Show PatternSet where
103139 showsPrec i (PatternSet s scc sce sec) =
104140 let (special,normal) = maybe (" " ," " ) ((partition (`elem` " ]-" )) . Set. toAscList) s
@@ -118,17 +154,27 @@ instance Show PatternSet where
118154 groupRange x n [] = if n <= 3 then take n [x.. ]
119155 else x: ' -' : (toEnum (pred n+ fromEnum x)): []
120156
157+ -- | Content of @[: :]@, e.g. @"alnum"@ for @[:alnum:]@.
121158newtype PatternSetCharacterClass = PatternSetCharacterClass { unSCC :: String }
122159 deriving (Eq ,Ord )
160+
161+ -- | Content of @[. .]@, e.g. @"ch"@ for @[.ch.]@.
123162newtype PatternSetCollatingElement = PatternSetCollatingElement { unSCE :: String }
124163 deriving (Eq ,Ord )
164+
165+ -- | Content of @[= =]@, e.g. @"a"@ for @[=a=]@.
125166newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass { unSEC :: String }
126167 deriving (Eq ,Ord )
127168
169+ -- | Hand-rolled implementation, giving textual rather than Haskell representation.
128170instance Show PatternSetCharacterClass where
129171 showsPrec _ p = showChar ' [' . showChar ' :' . shows (unSCC p) . showChar ' :' . showChar ' ]'
172+
173+ -- | Hand-rolled implementation, giving textual rather than Haskell representation.
130174instance Show PatternSetCollatingElement where
131175 showsPrec _ p = showChar ' [' . showChar ' .' . shows (unSCE p) . showChar ' .' . showChar ' ]'
176+
177+ -- | Hand-rolled implementation, giving textual rather than Haskell representation.
132178instance Show PatternSetEquivalenceClass where
133179 showsPrec _ p = showChar ' [' . showChar ' =' . shows (unSEC p) . showChar ' =' . showChar ' ]'
134180
@@ -165,18 +211,18 @@ decodeCharacterClass (PatternSetCharacterClass s) =
165211-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
166212
167213-- | Do the transformation and simplification in a single traversal.
168- -- This removes the PPlus, PQuest, and PBound values, changing to POr
169- -- and PEmpty and PStar True\/False . For some PBound values it adds
170- -- PNonEmpty and PNonCapture semantic marker. It also simplifies to
171- -- flatten out nested POr and PConcat instances and eliminate some
172- -- unneeded PEmpty values.
214+ -- This removes the ' PPlus', ' PQuest' , and ' PBound' values, changing to ' POr'
215+ -- and ' PEmpty' and ' PStar' . For some ' PBound' values it adds
216+ -- ' PNonEmpty' and ' PNonCapture' semantic marker. It also simplifies to
217+ -- flatten out nested ' POr' and ' PConcat' instances and eliminate some
218+ -- unneeded ' PEmpty' values.
173219starTrans :: Pattern -> Pattern
174220starTrans = dfsPattern (simplify' . starTrans')
175221
176- -- | Apply a Pattern transformation function depth first
177- dfsPattern :: (Pattern -> Pattern ) -- ^ The transformation function
178- -> Pattern -- ^ The Pattern to transform
179- -> Pattern -- ^ The transformed Pattern
222+ -- | Apply a ' Pattern' transformation function depth first.
223+ dfsPattern :: (Pattern -> Pattern ) -- ^ The transformation function.
224+ -> Pattern -- ^ The ' Pattern' to transform.
225+ -> Pattern -- ^ The transformed ' Pattern'.
180226dfsPattern f = dfs
181227 where unary c = f . c . dfs
182228 dfs pattern = case pattern of
@@ -354,7 +400,7 @@ starTrans' pIn =
354400 pass = pIn
355401
356402-- | Function to transform a pattern into an equivalent, but less
357- -- redundant form. Nested 'POr' and 'PConcat' are flattened. PEmpty
403+ -- redundant form. Nested 'POr' and 'PConcat' are flattened. ' PEmpty'
358404-- is propagated.
359405simplify' :: Pattern -> Pattern
360406simplify' x@ (POr _) =
@@ -376,7 +422,7 @@ simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful
376422-- simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009
377423simplify' other = other
378424
379- -- | Function to flatten nested POr or nested PConcat applicataions.
425+ -- | Function to flatten nested ' POr' or nested ' PConcat' applicataions.
380426flatten :: Pattern -> [Pattern ]
381427flatten (POr ps) = (concatMap (\ x -> case x of
382428 POr ps' -> ps'
@@ -390,8 +436,8 @@ notPEmpty :: Pattern -> Bool
390436notPEmpty PEmpty = False
391437notPEmpty _ = True
392438
393- -- | Determines if pIn will fail or accept [] and never accept any
394- -- characters. Treat PCarat and PDollar as True.
439+ -- | Determines if 'Pattern' will fail or accept @[]@ and never accept any
440+ -- characters. Treat ' PCarat' and ' PDollar' as @ True@ .
395441canOnlyMatchNull :: Pattern -> Bool
396442canOnlyMatchNull pIn =
397443 case pIn of
0 commit comments