88-- General character property related functions.
99--
1010module Unicode.Char.General
11- (
12- -- * Unicode general categories
13- GeneralCategory (.. )
11+ ( -- * Types of Code Points
12+ CodePointType (.. )
13+ , codePointType
14+
15+ -- * Unicode general categories
16+ , GeneralCategory (.. )
1417 , generalCategoryAbbr
1518 , generalCategory
1619
17- -- * Character classification
20+ -- * Character classification
1821 , isAlphabetic
1922 , isAlphaNum
2023 , isControl
@@ -25,9 +28,12 @@ module Unicode.Char.General
2528 , isSymbol
2629 , isWhiteSpace
2730 , isNoncharacter
31+
32+ -- ** Deprecated
2833 , isLetter
2934 , isSpace
30- -- ** Re-export
35+
36+ -- ** Re-export
3137 , isAscii
3238 , isLatin1
3339 , isAsciiUpper
@@ -95,6 +101,7 @@ module Unicode.Char.General
95101where
96102
97103import Control.Exception (assert )
104+ import Data.Bits ((.&.) )
98105import Data.Char (isAscii , isLatin1 , isAsciiUpper , isAsciiLower , ord )
99106import Data.Ix (Ix )
100107import Unicode.Internal.Division (quotRem28 )
@@ -103,11 +110,15 @@ import qualified Unicode.Internal.Char.DerivedCoreProperties as P
103110import qualified Unicode.Internal.Char.PropList as P
104111import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC
105112
113+ --------------------------------------------------------------------------------
114+ -- General Category
115+ --------------------------------------------------------------------------------
116+
106117{-| Unicode General Categories.
107118
108119These classes are defined in the
109120[Unicode Character Database](http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table),
110- part of the Unicode standard
121+ part of the Unicode standard.
111122
112123__Note:__ the classes must be in the same order they are listed in the Unicode Standard,
113124because some functions (e.g. 'generalCategory') rely on the 'Enum' instance.
@@ -217,6 +228,116 @@ prop> show (generalCategory c) == show (Data.Char.generalCategory c)
generalCategory :: Char -> GeneralCategory
-- The generated lookup table yields a number which 'toEnum' turns back into
-- the matching constructor; this relies on the declaration order of
-- 'GeneralCategory'.
generalCategory c = toEnum (UC.generalCategory c)
219230
231+ --------------------------------------------------------------------------------
232+ -- Types of Code Points
233+ --------------------------------------------------------------------------------
234+
-- | The type of a code point.
--
-- The classification follows the section
-- [2.4 “Code Points and Characters”](https://www.unicode.org/versions/Unicode15.0.0/ch02.pdf#G14527)
-- of the Unicode Standard. Each type is described below in terms of the
-- general categories it covers.
--
-- @since 0.4.1
data CodePointType
    = GraphicType
    -- ^ __Graphic__: code points with one of the following general categories:
    --
    -- * Letters (L): 'UppercaseLetter', 'LowercaseLetter', 'TitlecaseLetter',
    --   'ModifierLetter', 'OtherLetter'.
    -- * Marks (M): 'NonSpacingMark', 'SpacingCombiningMark', 'EnclosingMark'.
    -- * Numbers (N): 'DecimalNumber', 'LetterNumber', 'OtherNumber'.
    -- * Punctuation (P): 'ConnectorPunctuation', 'DashPunctuation',
    --   'OpenPunctuation', 'ClosePunctuation', 'InitialQuote', 'FinalQuote',
    --   'OtherPunctuation'.
    -- * Symbols (S): 'MathSymbol', 'CurrencySymbol', 'ModifierSymbol',
    --   'OtherSymbol'.
    -- * Separators: 'Space' only.
    | FormatType
    -- ^ __Format__: invisible, but affects neighboring characters.
    --
    -- Covers the general categories 'LineSeparator', 'ParagraphSeparator'
    -- and 'Format'.
    | ControlType
    -- ^ __Control__: usage is defined by protocols or standards outside the
    -- Unicode Standard.
    --
    -- Covers the general category 'Control'.
    | PrivateUseType
    -- ^ __Private-use__: usage is defined by private agreement outside the
    -- Unicode Standard.
    --
    -- Covers the general category 'PrivateUse'.
    | SurrogateType
    -- ^ __Surrogate__: permanently reserved for UTF-16.
    --
    -- Covers the general category 'Surrogate'.
    | NoncharacterType
    -- ^ __Noncharacter__: a code point that is permanently reserved for
    -- internal use (see definition D14 in the section
    -- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
    -- of the Unicode Standard).
    -- The noncharacters are the values @U+nFFFE@ and @U+nFFFF@ (where @n@
    -- is from 0 to 10₁₆) together with the values @U+FDD0..U+FDEF@.
    --
    -- A subset of the general category 'NotAssigned'.
    | ReservedType
    -- ^ __Reserved__: any code point of the Unicode Standard that is reserved
    -- for future assignment (see definition D15 in the section
    -- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
    -- of the Unicode Standard). Also known as an unassigned code point.
    --
    -- A subset of the general category 'NotAssigned'.
    deriving (Show, Eq, Ord, Enum, Bounded, Ix)
298+
-- | Returns the 'CodePointType' of a character.
--
-- The type is derived from the character's 'generalCategory'. Code points
-- in the 'NotAssigned' category are split further with 'isNoncharacter':
-- noncharacters map to 'NoncharacterType', all others to 'ReservedType'.
--
-- @since 0.4.1
codePointType :: Char -> CodePointType
-- Every general category is listed explicitly (no wildcard alternative), so
-- a category added to 'GeneralCategory' is not silently given a type here.
codePointType c = case generalCategory c of
    -- Letters (L)
    UppercaseLetter      -> GraphicType
    LowercaseLetter      -> GraphicType
    TitlecaseLetter      -> GraphicType
    ModifierLetter       -> GraphicType
    OtherLetter          -> GraphicType
    -- Marks (M)
    NonSpacingMark       -> GraphicType
    SpacingCombiningMark -> GraphicType
    EnclosingMark        -> GraphicType
    -- Numbers (N)
    DecimalNumber        -> GraphicType
    LetterNumber         -> GraphicType
    OtherNumber          -> GraphicType
    -- Punctuation (P)
    ConnectorPunctuation -> GraphicType
    DashPunctuation      -> GraphicType
    OpenPunctuation      -> GraphicType
    ClosePunctuation     -> GraphicType
    InitialQuote         -> GraphicType
    FinalQuote           -> GraphicType
    OtherPunctuation     -> GraphicType
    -- Symbols (S)
    MathSymbol           -> GraphicType
    CurrencySymbol       -> GraphicType
    ModifierSymbol       -> GraphicType
    OtherSymbol          -> GraphicType
    -- Separators: only the space separator is graphic; line and paragraph
    -- separators are format code points.
    Space                -> GraphicType
    LineSeparator        -> FormatType
    ParagraphSeparator   -> FormatType
    -- Others
    Control              -> ControlType
    Format               -> FormatType
    Surrogate            -> SurrogateType
    PrivateUse           -> PrivateUseType
    NotAssigned
        | isNoncharacter c -> NoncharacterType
        | otherwise        -> ReservedType
336+
337+ --------------------------------------------------------------------------------
338+ -- Predicates
339+ --------------------------------------------------------------------------------
340+
220341{-| Returns 'True' for alphabetic Unicode characters (lower-case, upper-case
221342and title-case letters, plus letters of caseless scripts and modifiers
222343letters).
0 commit comments