Skip to content

Commit d2296bd

Browse files
committed
core: Add Types of Code Points
1 parent a247a95 commit d2296bd

File tree

1 file changed

+127
-6
lines changed

1 file changed

+127
-6
lines changed

unicode-data/lib/Unicode/Char/General.hs

Lines changed: 127 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,16 @@
88
-- General character property related functions.
99
--
1010
module Unicode.Char.General
11-
(
12-
-- * Unicode general categories
13-
GeneralCategory(..)
11+
( -- * Types of Code Points
12+
CodePointType(..)
13+
, codePointType
14+
15+
-- * Unicode general categories
16+
, GeneralCategory(..)
1417
, generalCategoryAbbr
1518
, generalCategory
1619

17-
-- * Character classification
20+
-- * Character classification
1821
, isAlphabetic
1922
, isAlphaNum
2023
, isControl
@@ -25,9 +28,12 @@ module Unicode.Char.General
2528
, isSymbol
2629
, isWhiteSpace
2730
, isNoncharacter
31+
32+
-- ** Deprecated
2833
, isLetter
2934
, isSpace
30-
-- ** Re-export
35+
36+
-- ** Re-export
3137
, isAscii
3238
, isLatin1
3339
, isAsciiUpper
@@ -95,6 +101,7 @@ module Unicode.Char.General
95101
where
96102

97103
import Control.Exception (assert)
104+
import Data.Bits ((.&.))
98105
import Data.Char (isAscii, isLatin1, isAsciiUpper, isAsciiLower, ord)
99106
import Data.Ix (Ix)
100107
import Unicode.Internal.Division (quotRem28)
@@ -103,11 +110,15 @@ import qualified Unicode.Internal.Char.DerivedCoreProperties as P
103110
import qualified Unicode.Internal.Char.PropList as P
104111
import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC
105112

113+
--------------------------------------------------------------------------------
114+
-- General Category
115+
--------------------------------------------------------------------------------
116+
106117
{-| Unicode General Categories.
107118
108119
These classes are defined in the
109120
[Unicode Character Database](http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table),
110-
part of the Unicode standard
121+
part of the Unicode standard.
111122
112123
__Note:__ the classes must be in the same order they are listed in the Unicode Standard,
113124
because some functions (e.g. 'generalCategory') rely on the 'Enum' instance.
@@ -217,6 +228,116 @@ prop> show (generalCategory c) == show (Data.Char.generalCategory c)
217228
generalCategory :: Char -> GeneralCategory
-- The internal lookup yields an 'Int', which 'toEnum' turns into the
-- corresponding 'GeneralCategory' constructor.
generalCategory c = toEnum (UC.generalCategory c)
219230

231+
--------------------------------------------------------------------------------
232+
-- Types of Code Points
233+
--------------------------------------------------------------------------------
234+
235+
-- | The basic types of code points.
--
-- This classification follows the section
-- [2.4 “Code Points and Characters”](https://www.unicode.org/versions/Unicode15.0.0/ch02.pdf#G14527)
-- of the Unicode Standard.
--
-- __Note:__ the derived 'Ord', 'Enum', 'Bounded' and 'Ix' instances follow
-- the declaration order of the constructors.
--
-- @since 0.4.1
data CodePointType
    = GraphicType
    -- ^ __Graphic__: a code point whose general category is one of:
    --
    -- * Letters (L): 'UppercaseLetter', 'LowercaseLetter', 'TitlecaseLetter',
    --   'ModifierLetter', 'OtherLetter'.
    -- * Marks (M): 'NonSpacingMark', 'SpacingCombiningMark', 'EnclosingMark'.
    -- * Numbers (N): 'DecimalNumber', 'LetterNumber', 'OtherNumber'.
    -- * Punctuation (P): 'ConnectorPunctuation', 'DashPunctuation',
    --   'OpenPunctuation', 'ClosePunctuation', 'InitialQuote', 'FinalQuote',
    --   'OtherPunctuation'.
    -- * Symbols (S): 'MathSymbol', 'CurrencySymbol', 'ModifierSymbol',
    --   'OtherSymbol'.
    -- * Separators: 'Space'.
    | FormatType
    -- ^ __Format__: invisible, but affects neighboring characters.
    --
    -- Covers the general categories 'LineSeparator', 'ParagraphSeparator'
    -- and 'Format'.
    | ControlType
    -- ^ __Control__: usage is defined by protocols or standards outside the
    -- Unicode Standard.
    --
    -- Covers the general category 'Control'.
    | PrivateUseType
    -- ^ __Private-use__: usage is defined by private agreement outside the
    -- Unicode Standard.
    --
    -- Covers the general category 'PrivateUse'.
    | SurrogateType
    -- ^ __Surrogate__: permanently reserved for UTF-16.
    --
    -- Covers the general category 'Surrogate'.
    | NoncharacterType
    -- ^ __Noncharacter__: a code point that is permanently reserved for
    -- internal use (see definition D14 in the section
    -- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
    -- of the Unicode Standard).
    -- Noncharacters consist of the values @U+nFFFE@ and @U+nFFFF@ (where @n@
    -- is from 0 to 10₁₆) and the values @U+FDD0..U+FDEF@.
    --
    -- These code points are a subset of the general category 'NotAssigned'.
    | ReservedType
    -- ^ __Reserved__: any code point of the Unicode Standard that is reserved
    -- for future assignment (see definition D15 in the section
    -- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
    -- of the Unicode Standard). Also known as an unassigned code point.
    --
    -- These code points are a subset of the general category 'NotAssigned'.
    deriving (Show, Eq, Ord, Enum, Bounded, Ix)
298+
299+
-- NOTE(review): the 'CodePointType' declaration carries "@since 0.4.1" while
-- this function carries "@since 0.6.0"; confirm which version is intended.

-- | Classify a character by its 'CodePointType'.
--
-- The result is derived from the character's 'generalCategory'. Code points
-- in the 'NotAssigned' category are further split into 'NoncharacterType'
-- and 'ReservedType' by means of 'isNoncharacter'.
--
-- @since 0.6.0
codePointType :: Char -> CodePointType
codePointType c = fromCategory (generalCategory c)
  where
    -- Only demanded for code points in the 'NotAssigned' category.
    unassigned
        | isNoncharacter c = NoncharacterType
        | otherwise        = ReservedType

    -- Every category is listed explicitly (no wildcard alternative), so that
    -- a newly added 'GeneralCategory' constructor is not silently absorbed
    -- by a catch-all.
    fromCategory gc = case gc of
        -- Types that map to a single general category
        Control              -> ControlType
        Surrogate            -> SurrogateType
        PrivateUse           -> PrivateUseType
        NotAssigned          -> unassigned
        -- Format
        Format               -> FormatType
        LineSeparator        -> FormatType
        ParagraphSeparator   -> FormatType
        -- Graphic: letters
        UppercaseLetter      -> GraphicType
        LowercaseLetter      -> GraphicType
        TitlecaseLetter      -> GraphicType
        ModifierLetter       -> GraphicType
        OtherLetter          -> GraphicType
        -- Graphic: marks
        NonSpacingMark       -> GraphicType
        SpacingCombiningMark -> GraphicType
        EnclosingMark        -> GraphicType
        -- Graphic: numbers
        DecimalNumber        -> GraphicType
        LetterNumber         -> GraphicType
        OtherNumber          -> GraphicType
        -- Graphic: punctuation
        ConnectorPunctuation -> GraphicType
        DashPunctuation      -> GraphicType
        OpenPunctuation      -> GraphicType
        ClosePunctuation     -> GraphicType
        InitialQuote         -> GraphicType
        FinalQuote           -> GraphicType
        OtherPunctuation     -> GraphicType
        -- Graphic: symbols
        MathSymbol           -> GraphicType
        CurrencySymbol       -> GraphicType
        ModifierSymbol       -> GraphicType
        OtherSymbol          -> GraphicType
        -- Graphic: space separator
        Space                -> GraphicType
337+
--------------------------------------------------------------------------------
338+
-- Predicates
339+
--------------------------------------------------------------------------------
340+
220341
{-| Returns 'True' for alphabetic Unicode characters (lower-case, upper-case
221342
and title-case letters, plus letters of caseless scripts and modifier
222343
letters).

0 commit comments

Comments
 (0)