5566-- Stability : experimental
77--
8+ -- General character property related functions.
9+ --
810module Unicode.Char.General
9- ( isLetter
11+ (
12+ -- * Character Properties
13+ isLetter
1014 , isSpace
1115
12- -- Hangul
13- , hangulFirst
14- , hangulLast
15- , isHangul
16- , isHangulLV
17-
16+ -- * Korean Hangul Characters
17+ -- | The Hangul script used in the Korean writing system consists of
18+ -- individual consonant and vowel letters (jamo) that are visually combined
19+ -- into square display cells to form entire syllable blocks. Hangul
20+ -- syllables may be encoded directly as precomposed combinations of
21+ -- individual jamo or as decomposed sequences of conjoining jamo. Modern
22+ -- Hangul syllable blocks can be expressed with either two or three jamo,
23+ -- either in the form consonant + vowel or in the form consonant +
24+ -- vowel + consonant. The leading consonant is represented as L, the vowel
25+ -- as V and the trailing consonant as T.
26+ --
27+ -- The Unicode Standard contains both a large set of precomposed modern
28+ -- Hangul syllables and a set of conjoining Hangul jamo, which can be used
29+ -- to encode archaic Korean syllable blocks as well as modern Korean
30+ -- syllable blocks.
31+ --
32+ -- Hangul characters can be composed or decomposed algorithmically instead
33+ -- of via mappings. These APIs are used mainly for Unicode normalization
34+ -- of Hangul text.
35+ --
36+ -- Please refer to the following resources for more information:
37+ --
38+ -- * The @Hangul@ section of the @East Asia@ chapter of the [Unicode Standard](https://www.unicode.org/versions/latest)
39+ -- * The @Conformance@ chapter of the [Unicode Standard](https://www.unicode.org/versions/latest)
40+ -- * [Unicode® Standard Annex #15 - Unicode Normalization Forms](https://www.unicode.org/reports/tr15)
41+ -- * UCD file @HangulSyllableType.txt@
42+ -- * https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block)
43+ -- * https://en.wikipedia.org/wiki/List_of_Hangul_jamo
44+
45+ -- ** Conjoining Jamo
46+ -- | Jamo L, V and T letters.
1847 , isJamo
48+ , jamoNCount
49+
50+ -- *** Jamo Leading (L)
1951 , jamoLFirst
2052 , jamoLIndex
2153 , jamoLLast
2254
55+ -- *** Jamo Vowel (V)
2356 , jamoVFirst
2457 , jamoVCount
2558 , jamoVIndex
2659 , jamoVLast
2760
61+ -- *** Jamo Trailing (T)
2862 , jamoTFirst
2963 , jamoTCount
3064 , jamoTIndex
31- , jamoLast
65+ , jamoTLast
3266
33- , jamoNCount
67+ -- ** Hangul Syllables
68+ -- | Precomposed Hangul syllables.
69+ , hangulFirst
70+ , hangulLast
71+ , isHangul
72+ , isHangulLV
3473 )
3574where
3675
@@ -45,7 +84,7 @@ import qualified Unicode.Internal.Char.PropList as P
4584-- and title-case letters, plus letters of caseless scripts and modifier
4685-- letters).
4786--
48- -- prop> isLetter == Data.Char.isLetter
87+ -- prop> isLetter c == Data.Char.isLetter c
4988--
5089{-# INLINE isLetter #-}
5190isLetter :: Char -> Bool
@@ -54,88 +93,110 @@ isLetter = P.isAlphabetic
5493-- | Returns 'True' for any whitespace character, including the control
5594-- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@.
5695-- Implemented directly by @P.isWhite_Space@.
57- -- prop> isSpace == Data.Char.isSpace
96+ -- prop> isSpace c == Data.Char.isSpace c
5897--
5998{-# INLINE isSpace #-}
6099isSpace :: Char -> Bool
61100isSpace = P. isWhite_Space
62101
63102-------------------------------------------------------------------------------
64- -- Hangul
103+ -- Korean Hangul
65104-------------------------------------------------------------------------------
66105
67- -- General utilities used by decomposition as well as composition
68- -- Hangul characters can be decomposed algorithmically instead of via mappings
69-
70- -- * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
71- -- * https://en.wikipedia.org/wiki/List_of_Hangul_jamo
72- -- * https://www.unicode.org/reports/tr15/tr15-18.html#Hangul
73-
74- -- D134 Standard Korean syllable block: A sequence of one or more L followed
75- -- by a sequence of one or more V and a sequence of zero or more T,
76- -- or any other sequence that is canonically equivalent
77-
78106-- Jamo leading consonants (L): code point range and count.
79107jamoLFirst , jamoLCount , jamoLLast :: Int
108+
109+ -- | Code point of the first leading consonant jamo (U+1100).
80110jamoLFirst = 0x1100
111+
112+ -- | Total count of leading consonant jamo (19).
81113jamoLCount = 19
114+
115+ -- | Code point of the last leading consonant jamo, @jamoLFirst + jamoLCount - 1@ (U+1112).
82116jamoLLast = jamoLFirst + jamoLCount - 1
83117
84118-- Jamo vowels (V): code point range and count.
85119jamoVFirst , jamoVCount , jamoVLast :: Int
120+
121+ -- | Code point of the first vowel jamo (U+1161).
86122jamoVFirst = 0x1161
123+
124+ -- | Total count of vowel jamo (21).
87125jamoVCount = 21
126+
127+ -- | Code point of the last vowel jamo, @jamoVFirst + jamoVCount - 1@ (U+1175).
88128jamoVLast = jamoVFirst + jamoVCount - 1
89129
90130-- Jamo trailing consonants (T): base code point and count.
91- -- jamoTFirst does not represent a valid T, it represents a missing T i.e. LV
92- -- without a T. See comments under jamoTIndex .
93131jamoTFirst , jamoTCount :: Int
132+
133+ -- | Base code point used to compute trailing consonant jamo indices (U+11A7).
134+ --
135+ -- Note that 'jamoTFirst' does not represent a valid T: it stands for a missing
136+ -- T, i.e. LV without a T (index 0). See the comments under 'jamoTIndex'.
94137jamoTFirst = 0x11a7
138+
139+ -- | Number of trailing consonant slots (28), counting slot 0 which stands for \"no T\".
95140jamoTCount = 28
96141
97- jamoLast :: Int
98- jamoLast = jamoTFirst + jamoTCount - 1
142+ -- | Code point of the last trailing consonant jamo, @jamoTFirst + jamoTCount - 1@ (U+11C2).
143+ jamoTLast :: Int
144+ jamoTLast = jamoTFirst + jamoTCount - 1
99145
100- -- VCount * TCount
146+ -- | Number of vowel/trailing (V, T) combinations, i.e. the number of precomposed Hangul syllables per leading consonant.
147+ --
148+ -- @jamoNCount = jamoVCount * jamoTCount@ (21 * 28 = 588)
101149jamoNCount :: Int
102150jamoNCount = 588
103151
104152-- Precomposed Hangul syllables: code point range.
105153hangulFirst , hangulLast :: Int
154+
155+ -- | Codepoint of the first pre-composed Hangul character (U+AC00).
106156hangulFirst = 0xac00
157+
158+ -- | Codepoint of the last pre-composed Hangul character: one L * V * T block of 11172 code points starting at 'hangulFirst' (U+D7A3).
107159hangulLast = hangulFirst + jamoLCount * jamoVCount * jamoTCount - 1
108160
161+ -- | Determine if the given character is a precomposed Hangul syllable, i.e. @hangulFirst <= ord c <= hangulLast@.
109162isHangul :: Char -> Bool
110163isHangul c = n >= hangulFirst && n <= hangulLast
111164 where n = ord c
112165
166+ -- | Determine if the given character is a Hangul LV syllable (a syllable without a trailing consonant).
-- NOTE(review): no range check is done here; only @ord c - hangulFirst@ is passed to @quotRem28@ and the
-- second component of the result is compared with 0, so the argument is presumably expected to
-- satisfy 'isHangul' already - confirm against callers.
-- The 'assert' ties the use of @quotRem28@ to 'jamoTCount' being 28.
113167isHangulLV :: Char -> Bool
114168isHangulLV c = assert (jamoTCount == 28 )
115169 snd (quotRem28 (ord c - hangulFirst)) == 0
116170
171+ -- | Determine whether a character lies in the conjoining jamo range, i.e. @jamoLFirst <= ord c <= jamoTLast@.
-- NOTE(review): this is a single inclusive range check, so code points between the L, V and T
-- runs (after 'jamoLLast' and before 'jamoVFirst', and 'jamoTFirst' itself) are accepted too.
117172isJamo :: Char -> Bool
118- isJamo c = n >= jamoLFirst && n <= jamoLast
173+ isJamo c = n >= jamoLFirst && n <= jamoTLast
119174 where n = ord c
120175
121- -- if it is a jamo L char return the index
176+ -- | Given a Unicode character, if it is a leading jamo, return its zero-based index
177+ -- (offset from 'jamoLFirst', in the range @0 .. jamoLCount - 1@), otherwise return 'Nothing'.
122178jamoLIndex :: Char -> Maybe Int
123179jamoLIndex c
124180 | index >= 0 && index < jamoLCount = Just index
125181 | otherwise = Nothing
126182 where index = ord c - jamoLFirst
127183
184+ -- | Given a Unicode character, if it is a vowel jamo, return its zero-based index
185+ -- (offset from 'jamoVFirst', in the range @0 .. jamoVCount - 1@), otherwise return 'Nothing'.
128186jamoVIndex :: Char -> Maybe Int
129187jamoVIndex c
130188 | index >= 0 && index < jamoVCount = Just index
131189 | otherwise = Nothing
132190 where index = ord c - jamoVFirst
133191
192+ -- | Given a Unicode character, if it is a trailing jamo consonant, return its
193+ -- index in the list of trailing jamo consonants, otherwise return 'Nothing'.
194+ --
134195-- Note that index 0 is not a valid index for a trailing consonant. Index 0
135- -- means no T, only LV syllable.
136- -- See Unicode 9.0.0: 3.12 (Hangul Syllable Decomposition)
137- -- TBase is set to one less than the beginning of the range of trailing
138- -- consonants, which starts at U+11A8.
196+ -- corresponds to an LV syllable, without a T. See "Hangul Syllable
197+ -- Decomposition" in the Conformance chapter of the Unicode standard for more
198+ -- details.
199+ --
139200jamoTIndex :: Char -> Maybe Int
140201jamoTIndex c
141202 | index > 0 && index < jamoTCount = Just index
0 commit comments