Skip to content

Commit 5b5ee64

Browse files
committed
names: add label & nameOrLabel
1 parent d2296bd commit 5b5ee64

File tree

6 files changed

+195
-20
lines changed

6 files changed

+195
-20
lines changed

unicode-data-names/Changelog.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,23 @@
22

33
## 0.4.0 (July 2024)
44

5-
- Update to [Unicode 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/).
5+
- Updated to [Unicode 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/).
6+
- Added `label` and `nameOrLabel` to `Unicode.Char.General.Names`.
67

78
## 0.3.0 (July 2024)
89

9-
- Improve performance.
10+
- Improved performance.
1011
- Added opional support for `ByteString` API.
1112
Use the package flag `has-bytestring` to enable it.
1213
- Added opional support for `Text` API.
1314
Use the package flag `has-text` to enable it.
14-
- Add `unicodeVersion` to `Unicode.Char.General.Names`.
15-
- Fix the inlining of `Addr#` literals and reduce their size. This results in
15+
- Added `unicodeVersion` to `Unicode.Char.General.Names`.
16+
- Fixed the inlining of `Addr#` literals and reduce their size. This results in
1617
a sensible decrease of the executable size.
1718

1819
## 0.2.0 (September 2022)
1920

20-
- Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/).
21+
- Updated to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/).
2122

2223
## 0.1.0 (June 2022)
2324

unicode-data-names/bench/Main.hs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ benchmarks charRange charFilter = bgroup "All"
172172
]
173173
#endif
174174
]
175+
, bgroup "nameOrLabel"
176+
[ bgroup' "nameOrLabel" "String"
177+
[ Bench "unicode-data" String.nameOrLabel
178+
]
179+
]
175180
, bgroup "nameAliasesByType"
176181
[ bgroup' "nameAliasesByType" "String"
177182
[ Bench "unicode-data"
@@ -220,6 +225,11 @@ benchmarks charRange charFilter = bgroup "All"
220225
]
221226
#endif
222227
]
228+
, bgroup "label"
229+
[ bgroup' "label" "String"
230+
[ Bench "unicode-data" String.label
231+
]
232+
]
223233
]
224234
]
225235
where

unicode-data-names/lib/Unicode/Char/General/Names.hs

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,50 @@
1616
-- @since 0.1.0
1717

1818
module Unicode.Char.General.Names
19-
( -- Unicode version
19+
( -- * Unicode version
2020
unicodeVersion
21+
2122
-- * Name
2223
, name
2324
, nameOrAlias
25+
, nameOrLabel
2426
, correctedName
27+
2528
-- * Name Aliases
2629
, NameAliases.NameAliasType(..)
2730
, nameAliases
2831
, nameAliasesByType
2932
, nameAliasesWithTypes
33+
34+
-- * Label
35+
, label
3036
) where
3137

3238
import Control.Applicative ((<|>))
33-
import GHC.Exts
34-
( Addr#, Char(..), Char#, Int#
35-
, indexCharOffAddr#, plusAddr#, (+#), (-#), (<#), isTrue#, quotRemInt#
36-
, dataToTag#, ord# )
39+
import Control.Monad ((>=>))
40+
import Foreign.C.String (peekCAStringLen)
41+
import GHC.Exts (
42+
Addr#,
43+
Char (..),
44+
Char#,
45+
Int#,
46+
dataToTag#,
47+
indexCharOffAddr#,
48+
isTrue#,
49+
ord#,
50+
plusAddr#,
51+
quotRemInt#,
52+
(+#),
53+
(-#),
54+
(<#),
55+
)
56+
import System.IO.Unsafe (unsafeDupablePerformIO)
3757

3858
import Unicode.Internal.Bits.Names (unpackNBytes#)
59+
import qualified Unicode.Internal.Char.Label as Label
60+
import Unicode.Internal.Char.Names.Version (unicodeVersion)
3961
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
4062
import qualified Unicode.Internal.Char.UnicodeData.NameAliases as NameAliases
41-
import Unicode.Internal.Char.Names.Version (unicodeVersion)
4263

4364
-- | Name of a character, if defined.
4465
--
@@ -121,6 +142,15 @@ nameOrAlias c@(C# c#) = name c <|> case indexCharOffAddr# addr# 0# of
121142
0# -> go (t# +# 1#)
122143
i# -> unpackNBytes'# (addr# `plusAddr#` i#)
123144

145+
-- | Returns a character’s 'name' if defined,
146+
-- otherwise returns its label between angle brackets.
147+
--
148+
-- @since 0.4.0
149+
nameOrLabel :: Char -> String
150+
nameOrLabel c = case name c of
151+
Nothing -> '<' : label c ++ ">"
152+
Just n -> n
153+
124154
-- | All name aliases of a character, if defined.
125155
-- The names are listed in the original order of the UCD.
126156
--
@@ -189,6 +219,24 @@ nameAliasesByType# addr# t = case indexCharOffAddr# (addr# `plusAddr#` t#) 0# of
189219
i# -> unpackCStrings# (addr# `plusAddr#` ord# i#)
190220
where t# = dataToTag# t
191221

222+
-- | Returns the label of a code point if it has no character name, otherwise
223+
-- returns @\"UNDEFINED\"@.
224+
--
225+
-- See subsection
226+
-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248)
227+
-- in section 4.8 “Name” of the Unicode Standard.
228+
--
229+
-- >>> label '\0'
230+
-- "control-0000"
231+
-- >>> label 'a'
232+
-- "UNDEFINED"
233+
-- >>> label '\xffff'
234+
-- "noncharacter-FFFF"
235+
--
236+
-- @since 0.4.0
237+
label :: Char -> String
238+
label = unsafeDupablePerformIO . (Label.label >=> peekCAStringLen)
239+
192240
{-# INLINE unpackCStrings# #-}
193241
unpackCStrings# :: Addr# -> [String]
194242
unpackCStrings# = go
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
-- |
2+
-- Module : Unicode.Char
3+
-- Copyright : (c) 2024 Composewell Technologies and Contributors
4+
-- License : Apache-2.0
5+
-- Maintainer : [email protected]
6+
-- Stability : experimental
7+
8+
module Unicode.Internal.Char.Label
9+
( label
10+
, addHexCodePoint
11+
, intToDigiT
12+
) where
13+
14+
import Data.Char (ord)
15+
import Data.Functor (($>))
16+
import Foreign.C.String (CString, CStringLen)
17+
import Foreign.C.Types (CChar (..))
18+
import Foreign.Marshal (allocaArray, copyArray)
19+
import Foreign.Storable (Storable (..))
20+
import GHC.Exts (Int (..), Int#, Ptr (..), isTrue#, quotRemInt#, (+#), (-#), (<=#))
21+
import Unicode.Char.General (CodePointType (..), codePointType)
22+
23+
-- | Returns the label of a code point if it has no character name, otherwise
24+
-- returns @\"UNDEFINED\"@.
25+
--
26+
-- See subsection
27+
-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248)
28+
-- in section 4.8 “Name” of the Unicode Standard.
29+
--
30+
-- @since 0.4.0
31+
label :: Char -> IO CStringLen
32+
label c = case codePointType c of
33+
ControlType -> mkLabel 8# "control-"#
34+
PrivateUseType -> mkLabel 12# "private-use-"#
35+
SurrogateType -> mkLabel 10# "surrogate-"#
36+
NoncharacterType -> mkLabel 13# "noncharacter-"#
37+
ReservedType -> mkLabel 9# "reserved-"#
38+
_ -> pure (Ptr "UNDEFINED"#, 9)
39+
40+
where
41+
42+
mkLabel len s0 = allocaArray (I# len + 6) $ \s -> do
43+
copyArray s (Ptr s0) (I# len)
44+
len' <- addHexCodePoint s len len c
45+
pure (s, len')
46+
47+
-- | Appned the code point of a character using the Unicode Standard convention:
48+
-- hexadecimal codepoint padded with zeros if inferior to 4 characters.
49+
--
50+
-- It is the responsability of the caller to provide a 'CString' that can hold
51+
-- up to 6 characters from the provided index.
52+
addHexCodePoint
53+
:: CString -- ^ Destination ASCII string
54+
-> Int# -- ^ String length
55+
-> Int# -- ^ Index
56+
-> Char -- ^ Character which code point will be added to the string
57+
-> IO Int -- ^ New size of the string
58+
addHexCodePoint s len i0 c
59+
| isTrue# (cp# <=# 0x0000f#) = prependAt 3# <* pad0 0# <* pad0 1# <* pad0 2#
60+
| isTrue# (cp# <=# 0x000ff#) = prependAt 3# <* pad0 0# <* pad0 1#
61+
| isTrue# (cp# <=# 0x00fff#) = prependAt 3# <* pad0 0#
62+
| isTrue# (cp# <=# 0x0ffff#) = prependAt 3#
63+
| isTrue# (cp# <=# 0xfffff#) = prependAt 4#
64+
| otherwise = prependAt 5#
65+
where
66+
!(I# cp#) = ord c
67+
pad0 i = pokeElemOff s (I# (i0 +# i)) (CChar 0x30)
68+
prependAt i = go (i0 +# i) (quotRemInt# cp# 16#) $> I# (len +# i +# 1#)
69+
go i (# n#, d #) = do
70+
pokeElemOff s (I# i) (intToDigiT d)
71+
case n# of
72+
0# -> pure ()
73+
_ -> go (i -# 1#) (quotRemInt# n# 16#)
74+
75+
-- | Convert an 'Int#' in the range 0..15 to the corresponding single digit
76+
-- 'CChar' in upper case.
77+
--
78+
-- Undefined for numbers outside the 0..15 range.
79+
intToDigiT :: Int# -> CChar
80+
intToDigiT i = if isTrue# (i <=# 9#)
81+
then fromIntegral (I# (0x30# +# i))
82+
else fromIntegral (I# (0x37# +# i))

unicode-data-names/test/Unicode/Char/General/NamesSpec.hs

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,25 @@ module Unicode.Char.General.NamesSpec
44
( spec
55
) where
66

7-
import GHC.Exts (Char(..), isTrue#, (<#), ord#, andI#)
8-
import Unicode.Char.General
9-
( generalCategory,
10-
GeneralCategory(NotAssigned, Surrogate, PrivateUse) )
11-
import Unicode.Char.General.Names
12-
( NameAliasType (..), correctedName, name, nameOrAlias, nameAliasesWithTypes, nameAliases, nameAliasesByType )
13-
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
147
import Data.Foldable (traverse_)
15-
import Test.Hspec ( Spec, it, shouldBe, shouldSatisfy, describe )
8+
import GHC.Exts (Char (..), andI#, isTrue#, ord#, (<#))
9+
import Test.Hspec (Spec, describe, it, shouldBe, shouldSatisfy)
10+
import Unicode.Char.General (
11+
GeneralCategory (NotAssigned, PrivateUse, Surrogate),
12+
generalCategory,
13+
)
14+
15+
import Unicode.Char.General.Names (
16+
NameAliasType (..),
17+
correctedName,
18+
label,
19+
name,
20+
nameAliases,
21+
nameAliasesByType,
22+
nameAliasesWithTypes,
23+
nameOrAlias,
24+
)
25+
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
1626

1727
spec :: Spec
1828
spec = do
@@ -132,3 +142,25 @@ spec = do
132142
NotAssigned -> True
133143
_ -> False
134144
traverse_ (`shouldSatisfy` checkName) [minBound..maxBound]
145+
describe "label" do
146+
it "Some characters" do
147+
label '\x0000' `shouldBe` "control-0000"
148+
label '\x009F' `shouldBe` "control-009F"
149+
label 'a' `shouldBe` "UNDEFINED"
150+
label '1' `shouldBe` "UNDEFINED"
151+
label '\x1D0C5' `shouldBe` "UNDEFINED"
152+
label '\x2F89F' `shouldBe` "UNDEFINED"
153+
label '\xE000' `shouldBe` "private-use-E000"
154+
label '\x10FFFD' `shouldBe` "private-use-10FFFD"
155+
label '\xD800' `shouldBe` "surrogate-D800"
156+
label '\xDFFF' `shouldBe` "surrogate-DFFF"
157+
label '\xFDD0' `shouldBe` "noncharacter-FDD0"
158+
label '\x10FFFF' `shouldBe` "noncharacter-10FFFF"
159+
label '\x0378' `shouldBe` "reserved-0378"
160+
label '\x1FFFD' `shouldBe` "reserved-1FFFD"
161+
label '\xEFFFD' `shouldBe` "reserved-EFFFD"
162+
it "Every character has either a name or a label" do
163+
let checkName c = case name c of
164+
Just _ -> True
165+
Nothing -> label c /= "UNDEFINED"
166+
traverse_ (`shouldSatisfy` checkName) [minBound..maxBound]

unicode-data-names/unicode-data-names.cabal

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,16 +94,18 @@ library
9494
-- Generated files
9595
-- This module structure is largely based on the UCD file names from which
9696
-- the properties are generated.
97+
Unicode.Internal.Char.Label
98+
Unicode.Internal.Char.Names.Version
9799
Unicode.Internal.Char.UnicodeData.DerivedName
98100
Unicode.Internal.Char.UnicodeData.NameAliases
99-
Unicode.Internal.Char.Names.Version
100101
other-modules:
101102
-- Internal files
102103
Unicode.Internal.Bits.Names
103104

104105
hs-source-dirs: lib
105106
build-depends:
106107
base >= 4.7 && < 4.21,
108+
unicode-data >= 0.6 && < 0.7
107109
-- Support for raw string literals unpacking is included in base ≥ 4.15
108110
if impl(ghc < 9.0.0)
109111
build-depends:

0 commit comments

Comments
 (0)